[pypy-commit] pypy default: cache the string keys that occur in the json dicts, as they are likely to repeat

Fri Sep 22 15:20:23 EDT 2017

Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: default
Changeset: r92444:8aeaf30c80e8
Date: 2017-09-22 21:18 +0200
http://bitbucket.org/pypy/pypy/changeset/8aeaf30c80e8/

Log:	cache the string keys that occur in the json dicts, as they are
	likely to repeat

	this reduces both parsing time and memory usage

diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -1,6 +1,6 @@
 import sys
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.objectmodel import specialize, always_inline
+from rpython.rlib.objectmodel import specialize, always_inline, r_dict
 from rpython.rlib import rfloat, runicode
 from rpython.rtyper.lltypesystem import lltype, rffi
 from pypy.interpreter.error import oefmt
@@ -42,6 +42,22 @@
         ll_res.chars[i] = cast_primitive(UniChar, ch)
     return hlunicode(ll_res)
 
+def slice_eq(a, b):
+    (ll_chars1, start1, length1, _) = a
+    (ll_chars2, start2, length2, _) = b
+    if length1 != length2:
+        return False
+    j = start2
+    for i in range(start1, start1 + length1):
+        if ll_chars1[i] != ll_chars2[j]:
+            return False
+        j += 1
+    return True
+
+def slice_hash(a):
+    (ll_chars, start, length, h) = a
+    return h
+
 TYPE_UNKNOWN = 0
 TYPE_STRING = 1
 class JSONDecoder(object):
@@ -55,7 +71,7 @@
         self.ll_chars = rffi.str2charp(s)
         self.end_ptr = lltype.malloc(rffi.CCHARPP.TO, 1, flavor='raw')
         self.pos = 0
-        self.last_type = TYPE_UNKNOWN
+        self.cache = r_dict(slice_eq, slice_hash)
 
     def close(self):
         rffi.free_charp(self.ll_chars)
@@ -295,22 +311,23 @@
             i += 1
             bits |= ord(ch)
             if ch == '"':
-                if bits & 0x80:
-                    # the 8th bit is set, it's an utf8 strnig
-                    content_utf8 = self.getslice(start, i-1)
-                    content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
-                else:
-                    # ascii only, fast path (ascii is a strict subset of
-                    # latin1, and we already checked that all the chars are <
-                    # 128)
-                    content_unicode = strslice2unicode_latin1(self.s, start, i-1)
-                self.last_type = TYPE_STRING
                 self.pos = i
-                return self.space.newunicode(content_unicode)
+                return self.space.newunicode(
+                        self._create_string(start, i - 1, bits))
             elif ch == '\\' or ch < '\x20':
                 self.pos = i-1
                 return self.decode_string_escaped(start)
 
+    def _create_string(self, start, end, bits):
+        if bits & 0x80:
+            # the 8th bit is set, it's an utf8 string
+            content_utf8 = self.getslice(start, end)
+            return unicodehelper.decode_utf8(self.space, content_utf8)
+        else:
+            # ascii only, fast path (ascii is a strict subset of
+            # latin1, and we already checked that all the chars are <
+            # 128)
+            return strslice2unicode_latin1(self.s, start, end)
 
     def decode_string_escaped(self, start):
         i = self.pos
@@ -324,7 +341,6 @@
             if ch == '"':
                 content_utf8 = builder.build()
                 content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
-                self.last_type = TYPE_STRING
                 self.pos = i
                 return self.space.newunicode(content_unicode)
             elif ch == '\\':
@@ -387,6 +403,48 @@
         lowsurr = int(hexdigits, 16) # the possible ValueError is caugth by the caller
         return 0x10000 + (((highsurr - 0xd800) << 10) | (lowsurr - 0xdc00))
 
+    def decode_key(self, i):
+        """ returns an unwrapped unicode """
+        from rpython.rlib.rarithmetic import intmask
+
+        i = self.skip_whitespace(i)
+        ll_chars = self.ll_chars
+        ch = ll_chars[i]
+        if ch != '"':
+            self._raise("Key name must be string at char %d", i)
+        i += 1
+
+        start = i
+        bits = 0
+        strhash = ord(ll_chars[i]) << 7
+        while True:
+            ch = ll_chars[i]
+            i += 1
+            if ch == '"':
+                break
+            elif ch == '\\' or ch < '\x20':
+                self.pos = i-1
+                return self.space.unicode_w(self.decode_string_escaped(start))
+            strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i]))
+            bits |= ord(ch)
+        length = i - start - 1
+        if length == 0:
+            strhash = -1
+        else:
+            strhash ^= length
+            strhash = intmask(strhash)
+        self.pos = i
+        # check cache first:
+        key = (ll_chars, start, length, strhash)
+        try:
+            return self.cache[key]
+        except KeyError:
+            pass
+        res = self._create_string(start, i - 1, bits)
+        self.cache[key] = res
+        return res
+
+
 def loads(space, w_s):
     if space.isinstance_w(w_s, space.w_unicode):
         raise oefmt(space.w_TypeError,
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -10,7 +10,18 @@
     assert dec.skip_whitespace(8) == len(s)
     dec.close()
 
-    
+def test_decode_key():
+    s1 = "123" * 100
+    s = ' "%s"   "%s" ' % (s1, s1)
+    dec = JSONDecoder('fake space', s)
+    assert dec.pos == 0
+    x = dec.decode_key(0)
+    assert x == s1
+    # check caching
+    y = dec.decode_key(dec.pos)
+    assert y == s1
+    assert y is x
+    dec.close()
 
 class AppTest(object):
     spaceconfig = {"objspace.usemodules._pypyjson": True}
@@ -190,6 +201,12 @@
         res = _pypyjson.loads(json)
         assert res == {u'a': u'\ud83d'}
 
+    def test_cache_keys(self):
+        import _pypyjson
+        json = '[{"a": 1}, {"a": 2}]'
+        res = _pypyjson.loads(json)
+        assert res == [{u'a': 1}, {u'a': 2}]
+
     def test_tab_in_string_should_fail(self):
         import _pypyjson
         # http://json.org/JSON_checker/test/fail25.json
@@ -226,7 +243,7 @@
             ('{"spam":[42}', "Unexpected '}' when decoding array (char 11)"),
             ('["]', 'Unterminated string starting at char 1'),
             ('["spam":', "Unexpected ':' when decoding array (char 7)"),
-            ('[{]', "No JSON object could be decoded: unexpected ']' at char 2"),
+            ('[{]', "Key name must be string at char 2"),
         ]
         for inputtext, errmsg in test_cases:
             exc = raises(ValueError, _pypyjson.loads, inputtext)