[pypy-commit] pypy default: cache the string keys that occur in the json dicts, as they are likely to repeat
cfbolz
pypy.commits at gmail.com
Fri Sep 22 15:20:23 EDT 2017
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch:
Changeset: r92444:8aeaf30c80e8
Date: 2017-09-22 21:18 +0200
http://bitbucket.org/pypy/pypy/changeset/8aeaf30c80e8/
Log: cache the string keys that occur in the json dicts, as they are
likely to repeat.
This reduces both parsing time and memory usage.
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -1,6 +1,6 @@
import sys
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.objectmodel import specialize, always_inline
+from rpython.rlib.objectmodel import specialize, always_inline, r_dict
from rpython.rlib import rfloat, runicode
from rpython.rtyper.lltypesystem import lltype, rffi
from pypy.interpreter.error import oefmt
@@ -42,6 +42,22 @@
ll_res.chars[i] = cast_primitive(UniChar, ch)
return hlunicode(ll_res)
def slice_eq(a, b):
    """Tell whether two character slices hold the same content.

    Each argument is a 4-tuple ``(chars, start, length, hash)``.  Two
    slices are equal when their lengths match and the ``length`` items
    that begin at each ``start`` compare equal pairwise.  The hash field
    is not looked at.
    """
    (chars_a, begin_a, size_a, _) = a
    (chars_b, begin_b, size_b, _) = b
    # different lengths can never match, so skip the item comparison
    if size_a != size_b:
        return False
    offset = 0
    while offset < size_a:
        if chars_a[begin_a + offset] != chars_b[begin_b + offset]:
            return False
        offset += 1
    return True
+
def slice_hash(a):
    """Return the hash stored as the last item of a slice 4-tuple.

    ``a`` has the shape ``(chars, start, length, hash)``; nothing is
    recomputed here, the stored value is handed back unchanged.
    """
    (_, _, _, stored_hash) = a
    return stored_hash
+
TYPE_UNKNOWN = 0
TYPE_STRING = 1
class JSONDecoder(object):
@@ -55,7 +71,7 @@
self.ll_chars = rffi.str2charp(s)
self.end_ptr = lltype.malloc(rffi.CCHARPP.TO, 1, flavor='raw')
self.pos = 0
- self.last_type = TYPE_UNKNOWN
+ self.cache = r_dict(slice_eq, slice_hash)
def close(self):
rffi.free_charp(self.ll_chars)
@@ -295,22 +311,23 @@
i += 1
bits |= ord(ch)
if ch == '"':
- if bits & 0x80:
- # the 8th bit is set, it's an utf8 strnig
- content_utf8 = self.getslice(start, i-1)
- content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
- else:
- # ascii only, fast path (ascii is a strict subset of
- # latin1, and we already checked that all the chars are <
- # 128)
- content_unicode = strslice2unicode_latin1(self.s, start, i-1)
- self.last_type = TYPE_STRING
self.pos = i
- return self.space.newunicode(content_unicode)
+ return self.space.newunicode(
+ self._create_string(start, i - 1, bits))
elif ch == '\\' or ch < '\x20':
self.pos = i-1
return self.decode_string_escaped(start)
+ def _create_string(self, start, end, bits):
+ if bits & 0x80:
+ # the 8th bit is set, it's an utf8 string
+ content_utf8 = self.getslice(start, end)
+ return unicodehelper.decode_utf8(self.space, content_utf8)
+ else:
+ # ascii only, fast path (ascii is a strict subset of
+ # latin1, and we already checked that all the chars are <
+ # 128)
+ return strslice2unicode_latin1(self.s, start, end)
def decode_string_escaped(self, start):
i = self.pos
@@ -324,7 +341,6 @@
if ch == '"':
content_utf8 = builder.build()
content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
- self.last_type = TYPE_STRING
self.pos = i
return self.space.newunicode(content_unicode)
elif ch == '\\':
@@ -387,6 +403,48 @@
lowsurr = int(hexdigits, 16) # the possible ValueError is caugth by the caller
return 0x10000 + (((highsurr - 0xd800) << 10) | (lowsurr - 0xdc00))
+ def decode_key(self, i):
+ """ returns an unwrapped unicode """
+ from rpython.rlib.rarithmetic import intmask
+
+ i = self.skip_whitespace(i)
+ ll_chars = self.ll_chars
+ ch = ll_chars[i]
+ if ch != '"':
+ self._raise("Key name must be string at char %d", i)
+ i += 1
+
+ start = i
+ bits = 0
+ strhash = ord(ll_chars[i]) << 7
+ while True:
+ ch = ll_chars[i]
+ i += 1
+ if ch == '"':
+ break
+ elif ch == '\\' or ch < '\x20':
+ self.pos = i-1
+ return self.space.unicode_w(self.decode_string_escaped(start))
+ strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i]))
+ bits |= ord(ch)
+ length = i - start - 1
+ if length == 0:
+ strhash = -1
+ else:
+ strhash ^= length
+ strhash = intmask(strhash)
+ self.pos = i
+ # check cache first:
+ key = (ll_chars, start, length, strhash)
+ try:
+ return self.cache[key]
+ except KeyError:
+ pass
+ res = self._create_string(start, i - 1, bits)
+ self.cache[key] = res
+ return res
+
+
def loads(space, w_s):
if space.isinstance_w(w_s, space.w_unicode):
raise oefmt(space.w_TypeError,
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -10,7 +10,18 @@
assert dec.skip_whitespace(8) == len(s)
dec.close()
-
def test_decode_key():
    """decode_key returns the key text and hands back the cached object
    when the same key is decoded a second time."""
    s1 = "123" * 100
    s = ' "%s" "%s" ' % (s1, s1)
    dec = JSONDecoder('fake space', s)
    try:
        assert dec.pos == 0
        x = dec.decode_key(0)
        assert x == s1
        # check caching
        y = dec.decode_key(dec.pos)
        assert y == s1
        assert y is x
    finally:
        # release the decoder's buffers even when an assertion fails
        dec.close()
class AppTest(object):
spaceconfig = {"objspace.usemodules._pypyjson": True}
@@ -190,6 +201,12 @@
res = _pypyjson.loads(json)
assert res == {u'a': u'\ud83d'}
+ def test_cache_keys(self):
+ import _pypyjson
+ json = '[{"a": 1}, {"a": 2}]'
+ res = _pypyjson.loads(json)
+ assert res == [{u'a': 1}, {u'a': 2}]
+
def test_tab_in_string_should_fail(self):
import _pypyjson
# http://json.org/JSON_checker/test/fail25.json
@@ -226,7 +243,7 @@
('{"spam":[42}', "Unexpected '}' when decoding array (char 11)"),
('["]', 'Unterminated string starting at char 1'),
('["spam":', "Unexpected ':' when decoding array (char 7)"),
- ('[{]', "No JSON object could be decoded: unexpected ']' at char 2"),
+ ('[{]', "Key name must be string at char 2"),
]
for inputtext, errmsg in test_cases:
exc = raises(ValueError, _pypyjson.loads, inputtext)
More information about the pypy-commit
mailing list