[pypy-commit] pypy default: removed CPython-oriented code in json and added KeyValueBuilder(s) for speeding up json decoding

Wed Feb 8 10:57:59 CET 2012

Author: Stefano Parmesan <stefanop at ahref.eu>
Branch: 
Changeset: r52215:de5504a0f4f0
Date: 2012-01-27 15:06 +0100
http://bitbucket.org/pypy/pypy/changeset/de5504a0f4f0/

Log:	removed CPython-oriented code in json and added KeyValueBuilder(s)
	for speeding up json decoding

diff --git a/lib-python/modified-2.7/json/decoder.py b/lib-python/modified-2.7/json/decoder.py
--- a/lib-python/modified-2.7/json/decoder.py
+++ b/lib-python/modified-2.7/json/decoder.py
@@ -5,15 +5,47 @@
 import struct
 
 from json.scanner import make_scanner
-try:
-    from _json import scanstring as c_scanstring
-except ImportError:
-    c_scanstring = None
 
 __all__ = ['JSONDecoder']
 
 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
 
+
+class KeyValueElement(object):
+    __slots__ = ['key', 'value']
+
+    def __init__(self, key, value):
+        self.key = key
+        self.value = value
+
+
+class KeyValueAbstractBuilder(object):
+    __slots__ = ['elements', 'base_type']
+
+    def __init__(self):
+        self.elements = self.base_type()
+
+    def append(self, key, value):
+        pass
+
+    def build(self):
+        return self.elements
+
+
+class KeyValueListBuilder(KeyValueAbstractBuilder):
+    base_type = list
+
+    def append(self, key, value):
+        self.elements.append((key, value))
+
+
+class KeyValueDictBuilder(KeyValueAbstractBuilder):
+    base_type = dict
+
+    def append(self, key, value):
+        self.elements[key] = value
+
+
 def _floatconstants():
     _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
     if sys.byteorder != 'big':
@@ -62,7 +94,7 @@
 
 DEFAULT_ENCODING = "utf-8"
 
-def py_scanstring(s, end, encoding=None, strict=True,
+def scanstring(s, end, encoding=None, strict=True,
         _b=BACKSLASH, _m=STRINGCHUNK.match):
     """Scan the string s for a JSON string. End is the index of the
     character in s after the quote that started the JSON string.
@@ -75,7 +107,6 @@
     if encoding is None:
         encoding = DEFAULT_ENCODING
     chunks = []
-    _append = chunks.append
     begin = end - 1
     while 1:
         chunk = _m(s, end)
@@ -84,11 +115,13 @@
                 errmsg("Unterminated string starting at", s, begin))
         end = chunk.end()
         content, terminator = chunk.groups()
+        del chunk
         # Content is contains zero or more unescaped string characters
         if content:
             if not isinstance(content, unicode):
                 content = unicode(content, encoding)
-            _append(content)
+            chunks.append(content)
+        del content
         # Terminator is the end of string, a literal control character,
         # or a backslash denoting that an escape sequence follows
         if terminator == '"':
@@ -99,7 +132,8 @@
                 msg = "Invalid control character {0!r} at".format(terminator)
                 raise ValueError(errmsg(msg, s, end))
             else:
-                _append(terminator)
+                chunks.append(terminator)
+                del terminator
                 continue
         try:
             esc = s[end]
@@ -136,21 +170,16 @@
             char = unichr(uni)
             end = next_end
         # Append the unescaped character
-        _append(char)
+        chunks.append(char)
     return u''.join(chunks), end
 
 
-# Use speedup if available
-scanstring = c_scanstring or py_scanstring
-
 WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
 WHITESPACE_STR = ' \t\n\r'
 
 def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
                object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
     s, end = s_and_end
-    pairs = []
-    pairs_append = pairs.append
     # Use a slice to prevent IndexError from being raised, the following
     # check will raise a more specific ValueError if the string is empty
     nextchar = s[end:end + 1]
@@ -162,7 +191,7 @@
         # Trivial empty object
         if nextchar == '}':
             if object_pairs_hook is not None:
-                result = object_pairs_hook(pairs)
+                result = object_pairs_hook([])
                 return result, end
             pairs = {}
             if object_hook is not None:
@@ -171,7 +200,13 @@
         elif nextchar != '"':
             raise ValueError(errmsg("Expecting property name", s, end))
     end += 1
-    while True:
+
+    if object_pairs_hook is not None:
+        pairs = KeyValueListBuilder()
+    else:
+        pairs = KeyValueDictBuilder()
+
+    while 1:
         key, end = scanstring(s, end, encoding, strict)
 
         # To skip some function call overhead we optimize the fast paths where
@@ -195,7 +230,7 @@
             value, end = scan_once(s, end)
         except StopIteration:
             raise ValueError(errmsg("Expecting object", s, end))
-        pairs_append((key, value))
+        pairs.append(key, value)
 
         try:
             nextchar = s[end]
@@ -227,9 +262,9 @@
             raise ValueError(errmsg("Expecting property name", s, end - 1))
 
     if object_pairs_hook is not None:
-        result = object_pairs_hook(pairs)
+        result = object_pairs_hook(pairs.build())  # to list
         return result, end
-    pairs = dict(pairs)
+    pairs = pairs.build()  # to dict
     if object_hook is not None:
         pairs = object_hook(pairs)
     return pairs, end
@@ -244,13 +279,12 @@
     # Look-ahead for trivial empty array
     if nextchar == ']':
         return values, end + 1
-    _append = values.append
-    while True:
+    while 1:
         try:
             value, end = scan_once(s, end)
         except StopIteration:
             raise ValueError(errmsg("Expecting object", s, end))
-        _append(value)
+        values.append(value)
         nextchar = s[end:end + 1]
         if nextchar in _ws:
             end = _w(s, end + 1).end()
diff --git a/lib-python/modified-2.7/json/scanner.py b/lib-python/modified-2.7/json/scanner.py
--- a/lib-python/modified-2.7/json/scanner.py
+++ b/lib-python/modified-2.7/json/scanner.py
@@ -1,10 +1,6 @@
 """JSON token scanner
 """
 import re
-try:
-    from _json import make_scanner as c_make_scanner
-except ImportError:
-    c_make_scanner = None
 
 __all__ = ['make_scanner']
 
@@ -12,19 +8,7 @@
     r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?',
     (re.VERBOSE | re.MULTILINE | re.DOTALL))
 
-def py_make_scanner(context):
-    parse_object = context.parse_object
-    parse_array = context.parse_array
-    parse_string = context.parse_string
-    match_number = NUMBER_RE.match
-    encoding = context.encoding
-    strict = context.strict
-    parse_float = context.parse_float
-    parse_int = context.parse_int
-    parse_constant = context.parse_constant
-    object_hook = context.object_hook
-    object_pairs_hook = context.object_pairs_hook
-
+def make_scanner(context):
     def _scan_once(string, idx):
         try:
             nextchar = string[idx]
@@ -32,12 +16,12 @@
             raise StopIteration
 
         if nextchar == '"':
-            return parse_string(string, idx + 1, encoding, strict)
+            return context.parse_string(string, idx + 1, context.encoding, context.strict)
         elif nextchar == '{':
-            return parse_object((string, idx + 1), encoding, strict,
-                _scan_once, object_hook, object_pairs_hook)
+            return context.parse_object((string, idx + 1), context.encoding, context.strict,
+                _scan_once, context.object_hook, context.object_pairs_hook)
         elif nextchar == '[':
-            return parse_array((string, idx + 1), _scan_once)
+            return context.parse_array((string, idx + 1), _scan_once)
         elif nextchar == 'n' and string[idx:idx + 4] == 'null':
             return None, idx + 4
         elif nextchar == 't' and string[idx:idx + 4] == 'true':
@@ -45,23 +29,21 @@
         elif nextchar == 'f' and string[idx:idx + 5] == 'false':
             return False, idx + 5
 
-        m = match_number(string, idx)
+        m = NUMBER_RE.match(string, idx)
         if m is not None:
             integer, frac, exp = m.groups()
             if frac or exp:
-                res = parse_float(integer + (frac or '') + (exp or ''))
+                res = context.parse_float(integer + (frac or '') + (exp or ''))
             else:
-                res = parse_int(integer)
+                res = context.parse_int(integer)
             return res, m.end()
         elif nextchar == 'N' and string[idx:idx + 3] == 'NaN':
-            return parse_constant('NaN'), idx + 3
+            return context.parse_constant('NaN'), idx + 3
         elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity':
-            return parse_constant('Infinity'), idx + 8
+            return context.parse_constant('Infinity'), idx + 8
         elif nextchar == '-' and string[idx:idx + 9] == '-Infinity':
-            return parse_constant('-Infinity'), idx + 9
+            return context.parse_constant('-Infinity'), idx + 9
         else:
             raise StopIteration
 
     return _scan_once
-
-make_scanner = c_make_scanner or py_make_scanner