[pypy-commit] pypy default: merge faster-json branch. It provides encode() optimization for json module,
fijal
noreply at buildbot.pypy.org
Thu Oct 20 19:50:34 CEST 2011
Author: Maciej Fijalkowski <fijall at gmail.com>
Branch:
Changeset: r48276:51aebf5ad032
Date: 2011-10-20 19:49 +0200
http://bitbucket.org/pypy/pypy/changeset/51aebf5ad032/
Log: merge faster-json branch. It provides encode() optimization for json
module, that runs faster than the C extension on CPython (for our
benchmark)
diff --git a/lib-python/modified-2.7/json/encoder.py b/lib-python/modified-2.7/json/encoder.py
--- a/lib-python/modified-2.7/json/encoder.py
+++ b/lib-python/modified-2.7/json/encoder.py
@@ -2,14 +2,7 @@
"""
import re
-try:
- from _json import encode_basestring_ascii as c_encode_basestring_ascii
-except ImportError:
- c_encode_basestring_ascii = None
-try:
- from _json import make_encoder as c_make_encoder
-except ImportError:
- c_make_encoder = None
+from __pypy__.builders import StringBuilder, UnicodeBuilder
ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
@@ -24,8 +17,7 @@
'\t': '\\t',
}
for i in range(0x20):
- ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
- #ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
+ ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
# Assume this produces an infinity on all machines (probably not guaranteed)
INFINITY = float('1e66666')
@@ -37,10 +29,9 @@
"""
def replace(match):
return ESCAPE_DCT[match.group(0)]
- return '"' + ESCAPE.sub(replace, s) + '"'
+ return ESCAPE.sub(replace, s)
-
-def py_encode_basestring_ascii(s):
+def encode_basestring_ascii(s):
"""Return an ASCII-only JSON representation of a Python string
"""
@@ -53,20 +44,18 @@
except KeyError:
n = ord(s)
if n < 0x10000:
- return '\\u{0:04x}'.format(n)
- #return '\\u%04x' % (n,)
+ return '\\u%04x' % (n,)
else:
# surrogate pair
n -= 0x10000
s1 = 0xd800 | ((n >> 10) & 0x3ff)
s2 = 0xdc00 | (n & 0x3ff)
- return '\\u{0:04x}\\u{1:04x}'.format(s1, s2)
- #return '\\u%04x\\u%04x' % (s1, s2)
- return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
-
-
-encode_basestring_ascii = (
- c_encode_basestring_ascii or py_encode_basestring_ascii)
+ return '\\u%04x\\u%04x' % (s1, s2)
+ if ESCAPE_ASCII.search(s):
+ return str(ESCAPE_ASCII.sub(replace, s))
+ return s
+py_encode_basestring_ascii = lambda s: '"' + encode_basestring_ascii(s) + '"'
+c_encode_basestring_ascii = None
class JSONEncoder(object):
"""Extensible JSON <http://json.org> encoder for Python data structures.
@@ -147,6 +136,17 @@
self.skipkeys = skipkeys
self.ensure_ascii = ensure_ascii
+ if ensure_ascii:
+ self.encoder = encode_basestring_ascii
+ else:
+ self.encoder = encode_basestring
+ if encoding != 'utf-8':
+ orig_encoder = self.encoder
+ def encoder(o):
+ if isinstance(o, str):
+ o = o.decode(encoding)
+ return orig_encoder(o)
+ self.encoder = encoder
self.check_circular = check_circular
self.allow_nan = allow_nan
self.sort_keys = sort_keys
@@ -184,24 +184,126 @@
'{"foo": ["bar", "baz"]}'
"""
- # This is for extremely simple cases and benchmarks.
+ if self.check_circular:
+ markers = {}
+ else:
+ markers = None
+ if self.ensure_ascii:
+ builder = StringBuilder()
+ else:
+ builder = UnicodeBuilder()
+ self._encode(o, markers, builder, 0)
+ return builder.build()
+
+ def _emit_indent(self, builder, _current_indent_level):
+ if self.indent is not None:
+ _current_indent_level += 1
+ newline_indent = '\n' + (' ' * (self.indent *
+ _current_indent_level))
+ separator = self.item_separator + newline_indent
+ builder.append(newline_indent)
+ else:
+ separator = self.item_separator
+ return separator, _current_indent_level
+
+ def _emit_unindent(self, builder, _current_indent_level):
+ if self.indent is not None:
+ builder.append('\n')
+ builder.append(' ' * (self.indent * (_current_indent_level - 1)))
+
+ def _encode(self, o, markers, builder, _current_indent_level):
if isinstance(o, basestring):
- if isinstance(o, str):
- _encoding = self.encoding
- if (_encoding is not None
- and not (_encoding == 'utf-8')):
- o = o.decode(_encoding)
- if self.ensure_ascii:
- return encode_basestring_ascii(o)
+ builder.append('"')
+ builder.append(self.encoder(o))
+ builder.append('"')
+ elif o is None:
+ builder.append('null')
+ elif o is True:
+ builder.append('true')
+ elif o is False:
+ builder.append('false')
+ elif isinstance(o, (int, long)):
+ builder.append(str(o))
+ elif isinstance(o, float):
+ builder.append(self._floatstr(o))
+ elif isinstance(o, (list, tuple)):
+ if not o:
+ builder.append('[]')
+ return
+ self._encode_list(o, markers, builder, _current_indent_level)
+ elif isinstance(o, dict):
+ if not o:
+ builder.append('{}')
+ return
+ self._encode_dict(o, markers, builder, _current_indent_level)
+ else:
+ self._mark_markers(markers, o)
+ res = self.default(o)
+ self._encode(res, markers, builder, _current_indent_level)
+ self._remove_markers(markers, o)
+ return res
+
+ def _encode_list(self, l, markers, builder, _current_indent_level):
+ self._mark_markers(markers, l)
+ builder.append('[')
+ first = True
+ separator, _current_indent_level = self._emit_indent(builder,
+ _current_indent_level)
+ for elem in l:
+ if first:
+ first = False
else:
- return encode_basestring(o)
- # This doesn't pass the iterator directly to ''.join() because the
- # exceptions aren't as detailed. The list call should be roughly
- # equivalent to the PySequence_Fast that ''.join() would do.
- chunks = self.iterencode(o, _one_shot=True)
- if not isinstance(chunks, (list, tuple)):
- chunks = list(chunks)
- return ''.join(chunks)
+ builder.append(separator)
+ self._encode(elem, markers, builder, _current_indent_level)
+ del elem # XXX grumble
+ self._emit_unindent(builder, _current_indent_level)
+ builder.append(']')
+ self._remove_markers(markers, l)
+
+ def _encode_dict(self, d, markers, builder, _current_indent_level):
+ self._mark_markers(markers, d)
+ first = True
+ builder.append('{')
+ separator, _current_indent_level = self._emit_indent(builder,
+ _current_indent_level)
+ if self.sort_keys:
+ items = sorted(d.items(), key=lambda kv: kv[0])
+ else:
+ items = d.iteritems()
+
+ for key, v in items:
+ if first:
+ first = False
+ else:
+ builder.append(separator)
+ if isinstance(key, basestring):
+ pass
+ # JavaScript is weakly typed for these, so it makes sense to
+ # also allow them. Many encoders seem to do something like this.
+ elif isinstance(key, float):
+ key = self._floatstr(key)
+ elif key is True:
+ key = 'true'
+ elif key is False:
+ key = 'false'
+ elif key is None:
+ key = 'null'
+ elif isinstance(key, (int, long)):
+ key = str(key)
+ elif self.skipkeys:
+ continue
+ else:
+ raise TypeError("key " + repr(key) + " is not a string")
+ builder.append('"')
+ builder.append(self.encoder(key))
+ builder.append('"')
+ builder.append(self.key_separator)
+ self._encode(v, markers, builder, _current_indent_level)
+ del key
+ del v # XXX grumble
+ self._emit_unindent(builder, _current_indent_level)
+ builder.append('}')
+ self._remove_markers(markers, d)
def iterencode(self, o, _one_shot=False):
"""Encode the given object and yield each string
@@ -217,86 +319,54 @@
markers = {}
else:
markers = None
- if self.ensure_ascii:
- _encoder = encode_basestring_ascii
+ return self._iterencode(o, markers, 0)
+
+ def _floatstr(self, o):
+ # Check for specials. Note that this type of test is processor
+ # and/or platform-specific, so do tests which don't depend on the
+ # internals.
+
+ if o != o:
+ text = 'NaN'
+ elif o == INFINITY:
+ text = 'Infinity'
+ elif o == -INFINITY:
+ text = '-Infinity'
else:
- _encoder = encode_basestring
- if self.encoding != 'utf-8':
- def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
- if isinstance(o, str):
- o = o.decode(_encoding)
- return _orig_encoder(o)
+ return FLOAT_REPR(o)
- def floatstr(o, allow_nan=self.allow_nan,
- _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
- # Check for specials. Note that this type of test is processor
- # and/or platform-specific, so do tests which don't depend on the
- # internals.
+ if not self.allow_nan:
+ raise ValueError(
+ "Out of range float values are not JSON compliant: " +
+ repr(o))
- if o != o:
- text = 'NaN'
- elif o == _inf:
- text = 'Infinity'
- elif o == _neginf:
- text = '-Infinity'
- else:
- return _repr(o)
+ return text
- if not allow_nan:
- raise ValueError(
- "Out of range float values are not JSON compliant: " +
- repr(o))
+ def _mark_markers(self, markers, o):
+ if markers is not None:
+ if id(o) in markers:
+ raise ValueError("Circular reference detected")
+ markers[id(o)] = None
- return text
+ def _remove_markers(self, markers, o):
+ if markers is not None:
+ del markers[id(o)]
-
- if (_one_shot and c_make_encoder is not None
- and not self.indent and not self.sort_keys):
- _iterencode = c_make_encoder(
- markers, self.default, _encoder, self.indent,
- self.key_separator, self.item_separator, self.sort_keys,
- self.skipkeys, self.allow_nan)
- else:
- _iterencode = _make_iterencode(
- markers, self.default, _encoder, self.indent, floatstr,
- self.key_separator, self.item_separator, self.sort_keys,
- self.skipkeys, _one_shot)
- return _iterencode(o, 0)
-
-def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
- _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
- ## HACK: hand-optimized bytecode; turn globals into locals
- ValueError=ValueError,
- basestring=basestring,
- dict=dict,
- float=float,
- id=id,
- int=int,
- isinstance=isinstance,
- list=list,
- long=long,
- str=str,
- tuple=tuple,
- ):
-
- def _iterencode_list(lst, _current_indent_level):
+ def _iterencode_list(self, lst, markers, _current_indent_level):
if not lst:
yield '[]'
return
- if markers is not None:
- markerid = id(lst)
- if markerid in markers:
- raise ValueError("Circular reference detected")
- markers[markerid] = lst
+ self._mark_markers(markers, lst)
buf = '['
- if _indent is not None:
+ if self.indent is not None:
_current_indent_level += 1
- newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
- separator = _item_separator + newline_indent
+ newline_indent = '\n' + (' ' * (self.indent *
+ _current_indent_level))
+ separator = self.item_separator + newline_indent
buf += newline_indent
else:
newline_indent = None
- separator = _item_separator
+ separator = self.item_separator
first = True
for value in lst:
if first:
@@ -304,7 +374,7 @@
else:
buf = separator
if isinstance(value, basestring):
- yield buf + _encoder(value)
+ yield buf + '"' + self.encoder(value) + '"'
elif value is None:
yield buf + 'null'
elif value is True:
@@ -314,44 +384,43 @@
elif isinstance(value, (int, long)):
yield buf + str(value)
elif isinstance(value, float):
- yield buf + _floatstr(value)
+ yield buf + self._floatstr(value)
else:
yield buf
if isinstance(value, (list, tuple)):
- chunks = _iterencode_list(value, _current_indent_level)
+ chunks = self._iterencode_list(value, markers,
+ _current_indent_level)
elif isinstance(value, dict):
- chunks = _iterencode_dict(value, _current_indent_level)
+ chunks = self._iterencode_dict(value, markers,
+ _current_indent_level)
else:
- chunks = _iterencode(value, _current_indent_level)
+ chunks = self._iterencode(value, markers,
+ _current_indent_level)
for chunk in chunks:
yield chunk
if newline_indent is not None:
_current_indent_level -= 1
- yield '\n' + (' ' * (_indent * _current_indent_level))
+ yield '\n' + (' ' * (self.indent * _current_indent_level))
yield ']'
- if markers is not None:
- del markers[markerid]
+ self._remove_markers(markers, lst)
- def _iterencode_dict(dct, _current_indent_level):
+ def _iterencode_dict(self, dct, markers, _current_indent_level):
if not dct:
yield '{}'
return
- if markers is not None:
- markerid = id(dct)
- if markerid in markers:
- raise ValueError("Circular reference detected")
- markers[markerid] = dct
+ self._mark_markers(markers, dct)
yield '{'
- if _indent is not None:
+ if self.indent is not None:
_current_indent_level += 1
- newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
- item_separator = _item_separator + newline_indent
+ newline_indent = '\n' + (' ' * (self.indent *
+ _current_indent_level))
+ item_separator = self.item_separator + newline_indent
yield newline_indent
else:
newline_indent = None
- item_separator = _item_separator
+ item_separator = self.item_separator
first = True
- if _sort_keys:
+ if self.sort_keys:
items = sorted(dct.items(), key=lambda kv: kv[0])
else:
items = dct.iteritems()
@@ -361,7 +430,7 @@
# JavaScript is weakly typed for these, so it makes sense to
# also allow them. Many encoders seem to do something like this.
elif isinstance(key, float):
- key = _floatstr(key)
+ key = self._floatstr(key)
elif key is True:
key = 'true'
elif key is False:
@@ -370,7 +439,7 @@
key = 'null'
elif isinstance(key, (int, long)):
key = str(key)
- elif _skipkeys:
+ elif self.skipkeys:
continue
else:
raise TypeError("key " + repr(key) + " is not a string")
@@ -378,10 +447,10 @@
first = False
else:
yield item_separator
- yield _encoder(key)
- yield _key_separator
+ yield '"' + self.encoder(key) + '"'
+ yield self.key_separator
if isinstance(value, basestring):
- yield _encoder(value)
+ yield '"' + self.encoder(value) + '"'
elif value is None:
yield 'null'
elif value is True:
@@ -391,26 +460,28 @@
elif isinstance(value, (int, long)):
yield str(value)
elif isinstance(value, float):
- yield _floatstr(value)
+ yield self._floatstr(value)
else:
if isinstance(value, (list, tuple)):
- chunks = _iterencode_list(value, _current_indent_level)
+ chunks = self._iterencode_list(value, markers,
+ _current_indent_level)
elif isinstance(value, dict):
- chunks = _iterencode_dict(value, _current_indent_level)
+ chunks = self._iterencode_dict(value, markers,
+ _current_indent_level)
else:
- chunks = _iterencode(value, _current_indent_level)
+ chunks = self._iterencode(value, markers,
+ _current_indent_level)
for chunk in chunks:
yield chunk
if newline_indent is not None:
_current_indent_level -= 1
- yield '\n' + (' ' * (_indent * _current_indent_level))
+ yield '\n' + (' ' * (self.indent * _current_indent_level))
yield '}'
- if markers is not None:
- del markers[markerid]
+ self._remove_markers(markers, dct)
- def _iterencode(o, _current_indent_level):
+ def _iterencode(self, o, markers, _current_indent_level):
if isinstance(o, basestring):
- yield _encoder(o)
+ yield '"' + self.encoder(o) + '"'
elif o is None:
yield 'null'
elif o is True:
@@ -420,23 +491,19 @@
elif isinstance(o, (int, long)):
yield str(o)
elif isinstance(o, float):
- yield _floatstr(o)
+ yield self._floatstr(o)
elif isinstance(o, (list, tuple)):
- for chunk in _iterencode_list(o, _current_indent_level):
+ for chunk in self._iterencode_list(o, markers,
+ _current_indent_level):
yield chunk
elif isinstance(o, dict):
- for chunk in _iterencode_dict(o, _current_indent_level):
+ for chunk in self._iterencode_dict(o, markers,
+ _current_indent_level):
yield chunk
else:
- if markers is not None:
- markerid = id(o)
- if markerid in markers:
- raise ValueError("Circular reference detected")
- markers[markerid] = o
- o = _default(o)
- for chunk in _iterencode(o, _current_indent_level):
+ self._mark_markers(markers, o)
+ obj = self.default(o)
+ for chunk in self._iterencode(obj, markers,
+ _current_indent_level):
yield chunk
- if markers is not None:
- del markers[markerid]
-
- return _iterencode
+ self._remove_markers(markers, o)
diff --git a/lib-python/modified-2.7/json/tests/test_unicode.py b/lib-python/modified-2.7/json/tests/test_unicode.py
--- a/lib-python/modified-2.7/json/tests/test_unicode.py
+++ b/lib-python/modified-2.7/json/tests/test_unicode.py
@@ -80,3 +80,9 @@
self.assertEqual(type(json.loads(u'["a"]')[0]), unicode)
# Issue 10038.
self.assertEqual(type(json.loads('"foo"')), unicode)
+
+ def test_encode_not_utf_8(self):
+ self.assertEqual(json.dumps('\xb1\xe6', encoding='iso8859-2'),
+ '"\\u0105\\u0107"')
+ self.assertEqual(json.dumps(['\xb1\xe6'], encoding='iso8859-2'),
+ '["\\u0105\\u0107"]')
More information about the pypy-commit
mailing list