[pypy-commit] pypy default: merge faster-json branch. It provides encode() optimization for json module,

Thu Oct 20 19:50:34 CEST 2011

Author: Maciej Fijalkowski <fijall at gmail.com>
Branch: 
Changeset: r48276:51aebf5ad032
Date: 2011-10-20 19:49 +0200
http://bitbucket.org/pypy/pypy/changeset/51aebf5ad032/

Log:	merge faster-json branch. It provides encode() optimization for json
	module, that runs faster than the C extension on CPython (for our
	benchmark)

diff --git a/lib-python/modified-2.7/json/encoder.py b/lib-python/modified-2.7/json/encoder.py
--- a/lib-python/modified-2.7/json/encoder.py
+++ b/lib-python/modified-2.7/json/encoder.py
@@ -2,14 +2,7 @@
 """
 import re
 
-try:
-    from _json import encode_basestring_ascii as c_encode_basestring_ascii
-except ImportError:
-    c_encode_basestring_ascii = None
-try:
-    from _json import make_encoder as c_make_encoder
-except ImportError:
-    c_make_encoder = None
+from __pypy__.builders import StringBuilder, UnicodeBuilder
 
 ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
 ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
@@ -24,8 +17,7 @@
     '\t': '\\t',
 }
 for i in range(0x20):
-    ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
-    #ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
+    ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
 
 # Assume this produces an infinity on all machines (probably not guaranteed)
 INFINITY = float('1e66666')
@@ -37,10 +29,9 @@
     """
     def replace(match):
         return ESCAPE_DCT[match.group(0)]
-    return '"' + ESCAPE.sub(replace, s) + '"'
+    return ESCAPE.sub(replace, s)
 
-
-def py_encode_basestring_ascii(s):
+def encode_basestring_ascii(s):
     """Return an ASCII-only JSON representation of a Python string
 
     """
@@ -53,20 +44,18 @@
         except KeyError:
             n = ord(s)
             if n < 0x10000:
-                return '\\u{0:04x}'.format(n)
-                #return '\\u%04x' % (n,)
+                return '\\u%04x' % (n,)
             else:
                 # surrogate pair
                 n -= 0x10000
                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
                 s2 = 0xdc00 | (n & 0x3ff)
-                return '\\u{0:04x}\\u{1:04x}'.format(s1, s2)
-                #return '\\u%04x\\u%04x' % (s1, s2)
-    return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
-
-
-encode_basestring_ascii = (
-    c_encode_basestring_ascii or py_encode_basestring_ascii)
+                return '\\u%04x\\u%04x' % (s1, s2)
+    if ESCAPE_ASCII.search(s):
+        return str(ESCAPE_ASCII.sub(replace, s))
+    return s
+py_encode_basestring_ascii = lambda s: '"' + encode_basestring_ascii(s) + '"'
+c_encode_basestring_ascii = None
 
 class JSONEncoder(object):
     """Extensible JSON <http://json.org> encoder for Python data structures.
@@ -147,6 +136,17 @@
 
         self.skipkeys = skipkeys
         self.ensure_ascii = ensure_ascii
+        if ensure_ascii:
+            self.encoder = encode_basestring_ascii
+        else:
+            self.encoder = encode_basestring
+        if encoding != 'utf-8':
+            orig_encoder = self.encoder
+            def encoder(o):
+                if isinstance(o, str):
+                    o = o.decode(encoding)
+                return orig_encoder(o)
+            self.encoder = encoder
         self.check_circular = check_circular
         self.allow_nan = allow_nan
         self.sort_keys = sort_keys
@@ -184,24 +184,126 @@
         '{"foo": ["bar", "baz"]}'
 
         """
-        # This is for extremely simple cases and benchmarks.
+        if self.check_circular:
+            markers = {}
+        else:
+            markers = None
+        if self.ensure_ascii:
+            builder = StringBuilder()
+        else:
+            builder = UnicodeBuilder()
+        self._encode(o, markers, builder, 0)
+        return builder.build()
+
+    def _emit_indent(self, builder, _current_indent_level):
+        if self.indent is not None:
+            _current_indent_level += 1
+            newline_indent = '\n' + (' ' * (self.indent *
+                                            _current_indent_level))
+            separator = self.item_separator + newline_indent
+            builder.append(newline_indent)
+        else:
+            separator = self.item_separator
+        return separator, _current_indent_level
+
+    def _emit_unindent(self, builder, _current_indent_level):
+        if self.indent is not None:
+            builder.append('\n')
+            builder.append(' ' * (self.indent * (_current_indent_level - 1)))
+
+    def _encode(self, o, markers, builder, _current_indent_level):
         if isinstance(o, basestring):
-            if isinstance(o, str):
-                _encoding = self.encoding
-                if (_encoding is not None
-                        and not (_encoding == 'utf-8')):
-                    o = o.decode(_encoding)
-            if self.ensure_ascii:
-                return encode_basestring_ascii(o)
+            builder.append('"')
+            builder.append(self.encoder(o))
+            builder.append('"')
+        elif o is None:
+            builder.append('null')
+        elif o is True:
+            builder.append('true')
+        elif o is False:
+            builder.append('false')
+        elif isinstance(o, (int, long)):
+            builder.append(str(o))
+        elif isinstance(o, float):
+            builder.append(self._floatstr(o))
+        elif isinstance(o, (list, tuple)):
+            if not o:
+                builder.append('[]')
+                return
+            self._encode_list(o, markers, builder, _current_indent_level)
+        elif isinstance(o, dict):
+            if not o:
+                builder.append('{}')
+                return
+            self._encode_dict(o, markers, builder, _current_indent_level)
+        else:
+            self._mark_markers(markers, o)
+            res = self.default(o)
+            self._encode(res, markers, builder, _current_indent_level)
+            self._remove_markers(markers, o)
+            return res
+
+    def _encode_list(self, l, markers, builder, _current_indent_level):
+        self._mark_markers(markers, l)
+        builder.append('[')
+        first = True
+        separator, _current_indent_level = self._emit_indent(builder,
+                                                      _current_indent_level)
+        for elem in l:
+            if first:
+                first = False
             else:
-                return encode_basestring(o)
-        # This doesn't pass the iterator directly to ''.join() because the
-        # exceptions aren't as detailed.  The list call should be roughly
-        # equivalent to the PySequence_Fast that ''.join() would do.
-        chunks = self.iterencode(o, _one_shot=True)
-        if not isinstance(chunks, (list, tuple)):
-            chunks = list(chunks)
-        return ''.join(chunks)
+                builder.append(separator)
+            self._encode(elem, markers, builder, _current_indent_level)
+            del elem # XXX grumble
+        self._emit_unindent(builder, _current_indent_level)
+        builder.append(']')
+        self._remove_markers(markers, l)
+
+    def _encode_dict(self, d, markers, builder, _current_indent_level):
+        self._mark_markers(markers, d)
+        first = True
+        builder.append('{')
+        separator, _current_indent_level = self._emit_indent(builder,
+                                                         _current_indent_level)
+        if self.sort_keys:
+            items = sorted(d.items(), key=lambda kv: kv[0])
+        else:
+            items = d.iteritems()
+
+        for key, v in items:
+            if first:
+                first = False
+            else:
+                builder.append(separator)
+            if isinstance(key, basestring):
+                pass
+            # JavaScript is weakly typed for these, so it makes sense to
+            # also allow them.  Many encoders seem to do something like this.
+            elif isinstance(key, float):
+                key = self._floatstr(key)
+            elif key is True:
+                key = 'true'
+            elif key is False:
+                key = 'false'
+            elif key is None:
+                key = 'null'
+            elif isinstance(key, (int, long)):
+                key = str(key)
+            elif self.skipkeys:
+                continue
+            else:
+                raise TypeError("key " + repr(key) + " is not a string")
+            builder.append('"')
+            builder.append(self.encoder(key))
+            builder.append('"')
+            builder.append(self.key_separator)
+            self._encode(v, markers, builder, _current_indent_level)
+            del key
+            del v # XXX grumble
+        self._emit_unindent(builder, _current_indent_level)
+        builder.append('}')
+        self._remove_markers(markers, d)
 
     def iterencode(self, o, _one_shot=False):
         """Encode the given object and yield each string
@@ -217,86 +319,54 @@
             markers = {}
         else:
             markers = None
-        if self.ensure_ascii:
-            _encoder = encode_basestring_ascii
+        return self._iterencode(o, markers, 0)
+
+    def _floatstr(self, o):
+        # Check for specials.  Note that this type of test is processor
+        # and/or platform-specific, so do tests which don't depend on the
+        # internals.
+
+        if o != o:
+            text = 'NaN'
+        elif o == INFINITY:
+            text = 'Infinity'
+        elif o == -INFINITY:
+            text = '-Infinity'
         else:
-            _encoder = encode_basestring
-        if self.encoding != 'utf-8':
-            def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
-                if isinstance(o, str):
-                    o = o.decode(_encoding)
-                return _orig_encoder(o)
+            return FLOAT_REPR(o)
 
-        def floatstr(o, allow_nan=self.allow_nan,
-                _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
-            # Check for specials.  Note that this type of test is processor
-            # and/or platform-specific, so do tests which don't depend on the
-            # internals.
+        if not self.allow_nan:
+            raise ValueError(
+                "Out of range float values are not JSON compliant: " +
+                repr(o))
 
-            if o != o:
-                text = 'NaN'
-            elif o == _inf:
-                text = 'Infinity'
-            elif o == _neginf:
-                text = '-Infinity'
-            else:
-                return _repr(o)
+        return text
 
-            if not allow_nan:
-                raise ValueError(
-                    "Out of range float values are not JSON compliant: " +
-                    repr(o))
+    def _mark_markers(self, markers, o):
+        if markers is not None:
+            if id(o) in markers:
+                raise ValueError("Circular reference detected")
+            markers[id(o)] = None
 
-            return text
+    def _remove_markers(self, markers, o):
+        if markers is not None:
+            del markers[id(o)]
 
-
-        if (_one_shot and c_make_encoder is not None
-                and not self.indent and not self.sort_keys):
-            _iterencode = c_make_encoder(
-                markers, self.default, _encoder, self.indent,
-                self.key_separator, self.item_separator, self.sort_keys,
-                self.skipkeys, self.allow_nan)
-        else:
-            _iterencode = _make_iterencode(
-                markers, self.default, _encoder, self.indent, floatstr,
-                self.key_separator, self.item_separator, self.sort_keys,
-                self.skipkeys, _one_shot)
-        return _iterencode(o, 0)
-
-def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
-        _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
-        ## HACK: hand-optimized bytecode; turn globals into locals
-        ValueError=ValueError,
-        basestring=basestring,
-        dict=dict,
-        float=float,
-        id=id,
-        int=int,
-        isinstance=isinstance,
-        list=list,
-        long=long,
-        str=str,
-        tuple=tuple,
-    ):
-
-    def _iterencode_list(lst, _current_indent_level):
+    def _iterencode_list(self, lst, markers, _current_indent_level):
         if not lst:
             yield '[]'
             return
-        if markers is not None:
-            markerid = id(lst)
-            if markerid in markers:
-                raise ValueError("Circular reference detected")
-            markers[markerid] = lst
+        self._mark_markers(markers, lst)
         buf = '['
-        if _indent is not None:
+        if self.indent is not None:
             _current_indent_level += 1
-            newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
-            separator = _item_separator + newline_indent
+            newline_indent = '\n' + (' ' * (self.indent *
+                                            _current_indent_level))
+            separator = self.item_separator + newline_indent
             buf += newline_indent
         else:
             newline_indent = None
-            separator = _item_separator
+            separator = self.item_separator
         first = True
         for value in lst:
             if first:
@@ -304,7 +374,7 @@
             else:
                 buf = separator
             if isinstance(value, basestring):
-                yield buf + _encoder(value)
+                yield buf + '"' + self.encoder(value) + '"'
             elif value is None:
                 yield buf + 'null'
             elif value is True:
@@ -314,44 +384,43 @@
             elif isinstance(value, (int, long)):
                 yield buf + str(value)
             elif isinstance(value, float):
-                yield buf + _floatstr(value)
+                yield buf + self._floatstr(value)
             else:
                 yield buf
                 if isinstance(value, (list, tuple)):
-                    chunks = _iterencode_list(value, _current_indent_level)
+                    chunks = self._iterencode_list(value, markers,
+                                                   _current_indent_level)
                 elif isinstance(value, dict):
-                    chunks = _iterencode_dict(value, _current_indent_level)
+                    chunks = self._iterencode_dict(value, markers,
+                                                   _current_indent_level)
                 else:
-                    chunks = _iterencode(value, _current_indent_level)
+                    chunks = self._iterencode(value, markers,
+                                              _current_indent_level)
                 for chunk in chunks:
                     yield chunk
         if newline_indent is not None:
             _current_indent_level -= 1
-            yield '\n' + (' ' * (_indent * _current_indent_level))
+            yield '\n' + (' ' * (self.indent * _current_indent_level))
         yield ']'
-        if markers is not None:
-            del markers[markerid]
+        self._remove_markers(markers, lst)
 
-    def _iterencode_dict(dct, _current_indent_level):
+    def _iterencode_dict(self, dct, markers, _current_indent_level):
         if not dct:
             yield '{}'
             return
-        if markers is not None:
-            markerid = id(dct)
-            if markerid in markers:
-                raise ValueError("Circular reference detected")
-            markers[markerid] = dct
+        self._mark_markers(markers, dct)
         yield '{'
-        if _indent is not None:
+        if self.indent is not None:
             _current_indent_level += 1
-            newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
-            item_separator = _item_separator + newline_indent
+            newline_indent = '\n' + (' ' * (self.indent *
+                                            _current_indent_level))
+            item_separator = self.item_separator + newline_indent
             yield newline_indent
         else:
             newline_indent = None
-            item_separator = _item_separator
+            item_separator = self.item_separator
         first = True
-        if _sort_keys:
+        if self.sort_keys:
             items = sorted(dct.items(), key=lambda kv: kv[0])
         else:
             items = dct.iteritems()
@@ -361,7 +430,7 @@
             # JavaScript is weakly typed for these, so it makes sense to
             # also allow them.  Many encoders seem to do something like this.
             elif isinstance(key, float):
-                key = _floatstr(key)
+                key = self._floatstr(key)
             elif key is True:
                 key = 'true'
             elif key is False:
@@ -370,7 +439,7 @@
                 key = 'null'
             elif isinstance(key, (int, long)):
                 key = str(key)
-            elif _skipkeys:
+            elif self.skipkeys:
                 continue
             else:
                 raise TypeError("key " + repr(key) + " is not a string")
@@ -378,10 +447,10 @@
                 first = False
             else:
                 yield item_separator
-            yield _encoder(key)
-            yield _key_separator
+            yield '"' + self.encoder(key) + '"'
+            yield self.key_separator
             if isinstance(value, basestring):
-                yield _encoder(value)
+                yield '"' + self.encoder(value) + '"'
             elif value is None:
                 yield 'null'
             elif value is True:
@@ -391,26 +460,28 @@
             elif isinstance(value, (int, long)):
                 yield str(value)
             elif isinstance(value, float):
-                yield _floatstr(value)
+                yield self._floatstr(value)
             else:
                 if isinstance(value, (list, tuple)):
-                    chunks = _iterencode_list(value, _current_indent_level)
+                    chunks = self._iterencode_list(value, markers,
+                                                   _current_indent_level)
                 elif isinstance(value, dict):
-                    chunks = _iterencode_dict(value, _current_indent_level)
+                    chunks = self._iterencode_dict(value, markers,
+                                                   _current_indent_level)
                 else:
-                    chunks = _iterencode(value, _current_indent_level)
+                    chunks = self._iterencode(value, markers,
+                                              _current_indent_level)
                 for chunk in chunks:
                     yield chunk
         if newline_indent is not None:
             _current_indent_level -= 1
-            yield '\n' + (' ' * (_indent * _current_indent_level))
+            yield '\n' + (' ' * (self.indent * _current_indent_level))
         yield '}'
-        if markers is not None:
-            del markers[markerid]
+        self._remove_markers(markers, dct)
 
-    def _iterencode(o, _current_indent_level):
+    def _iterencode(self, o, markers, _current_indent_level):
         if isinstance(o, basestring):
-            yield _encoder(o)
+            yield '"' + self.encoder(o) + '"'
         elif o is None:
             yield 'null'
         elif o is True:
@@ -420,23 +491,19 @@
         elif isinstance(o, (int, long)):
             yield str(o)
         elif isinstance(o, float):
-            yield _floatstr(o)
+            yield self._floatstr(o)
         elif isinstance(o, (list, tuple)):
-            for chunk in _iterencode_list(o, _current_indent_level):
+            for chunk in self._iterencode_list(o, markers,
+                                               _current_indent_level):
                 yield chunk
         elif isinstance(o, dict):
-            for chunk in _iterencode_dict(o, _current_indent_level):
+            for chunk in self._iterencode_dict(o, markers,
+                                               _current_indent_level):
                 yield chunk
         else:
-            if markers is not None:
-                markerid = id(o)
-                if markerid in markers:
-                    raise ValueError("Circular reference detected")
-                markers[markerid] = o
-            o = _default(o)
-            for chunk in _iterencode(o, _current_indent_level):
+            self._mark_markers(markers, o)
+            obj = self.default(o)
+            for chunk in self._iterencode(obj, markers,
+                                          _current_indent_level):
                 yield chunk
-            if markers is not None:
-                del markers[markerid]
-
-    return _iterencode
+            self._remove_markers(markers, o)
diff --git a/lib-python/modified-2.7/json/tests/test_unicode.py b/lib-python/modified-2.7/json/tests/test_unicode.py
--- a/lib-python/modified-2.7/json/tests/test_unicode.py
+++ b/lib-python/modified-2.7/json/tests/test_unicode.py
@@ -80,3 +80,9 @@
         self.assertEqual(type(json.loads(u'["a"]')[0]), unicode)
         # Issue 10038.
         self.assertEqual(type(json.loads('"foo"')), unicode)
+
+    def test_encode_not_utf_8(self):
+        self.assertEqual(json.dumps('\xb1\xe6', encoding='iso8859-2'),
+                         '"\\u0105\\u0107"')
+        self.assertEqual(json.dumps(['\xb1\xe6'], encoding='iso8859-2'),
+                         '["\\u0105\\u0107"]')