[pypy-commit] pypy utf8-unicode2: Fix error handling in unicode_internal codec
waedt
noreply at buildbot.pypy.org
Sat Aug 9 08:47:00 CEST 2014
Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72725:36fc05030a2a
Date: 2014-08-09 01:45 -0500
http://bitbucket.org/pypy/pypy/changeset/36fc05030a2a/
Log: Fix error handling in unicode_internal codec
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -1351,12 +1351,8 @@
def str_decode_unicode_internal(s, size, errors, final=False,
errorhandler=None):
- if BYTEORDER == 'little':
- result, length, byteorder = str_decode_utf_32_helper(
- s, size, errors, final, errorhandler, "little", "unicode_internal")
- else:
- result, length, byteorder = str_decode_utf_32_helper(
- s, size, errors, final, errorhandler, "internal", "unicode_internal")
+ result, length = str_decode_unicode_internal_helper(
+ s, size, errors, final, errorhandler)
return result, length
def unicode_encode_unicode_internal(s, size, errors, errorhandler=None):
@@ -1365,6 +1361,46 @@
else:
return unicode_encode_utf_32_be(s, size, errors, errorhandler)
+def str_decode_unicode_internal_helper(s, size, errors, final=True,
+                                       errorhandler=None):
+    """Decode `s` as a run of 4-byte code points in BYTEORDER byte order.
+
+    Each group of four bytes is combined into one integer (least
+    significant byte first when BYTEORDER is 'little', last otherwise)
+    and appended to the result as a code point.
+
+    s            -- the byte string to decode
+    size         -- number of bytes of `s` to decode
+    errors       -- error-handling scheme name, passed to `errorhandler`
+    final        -- if false, a trailing group of fewer than four bytes
+                    is left unconsumed instead of being reported
+    errorhandler -- called as errorhandler(errors, "unicode_internal",
+                    message, s, startpos, endpos); must return
+                    (replacement, newpos).  Defaults to
+                    default_unicode_error_decode.
+
+    Returns (decoded, consumed): the built string and the position in
+    `s` at which decoding stopped.
+    """
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+
+    if size == 0:
+        return Utf8Str(''), 0
+
+    if BYTEORDER == 'little':
+        iorder = [0, 1, 2, 3]
+    else:
+        iorder = [3, 2, 1, 0]
+
+    # Use one bound for the loop, the truncation checks and the end
+    # position handed to the error handler: never look past `size`, nor
+    # past the bytes actually present in `s`.  Identical to the previous
+    # behaviour whenever size == len(s).
+    end = min(size, len(s))
+    pos = 0
+    result = Utf8Builder(size // 4)
+
+    while pos < end:
+        # remaining bytes at the end? (size should be divisible by 4)
+        if end - pos < 4:
+            if not final:
+                break
+            r, pos = errorhandler(errors, "unicode_internal", "truncated data",
+                                  s, pos, end)
+            result.append_utf8(r)
+            # The handler chose where to resume; stop if that still does
+            # not leave a complete 4-byte group.
+            if end - pos < 4:
+                break
+            continue
+        ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
+              (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
+        if ch >= 0x110000:
+            # Out-of-range value: report exactly these four bytes and
+            # resume wherever the handler says.
+            r, pos = errorhandler(errors, "unicode_internal",
+                                  "codepoint not in range(0x110000)",
+                                  s, pos, pos + 4)
+            result.append_utf8(r)
+            continue
+
+        result.append_codepoint(ch)
+        pos += 4
+    return result.build(), pos
+
# }}}
# ____________________________________________________________
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -727,3 +727,12 @@
_codecs.register_error("test.test_codecs_not_a_string", f)
raises(TypeError, u'\u1234'.encode, 'ascii',
'test.test_codecs_not_a_string')
+
+ def test_decode_callback(self):
+ # Regression test: an error handler registered through
+ # codecs.register_error must be honoured by the unicode_internal
+ # decoder.  The bytes \x22\x22\x22\x22 are spliced between the two
+ # halves of the encoded u"ab"; with codecs.ignore_errors as the
+ # handler the expected result is u"ab" with all 12 input bytes
+ # reported as consumed.
+ # NOTE(review): the [:4] / [4:] split assumes 4 bytes per character
+ # in the unicode_internal encoding -- confirm for the target build.
+ import codecs
+ codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
+ decoder = codecs.getdecoder("unicode_internal")
+ ab = u"ab".encode("unicode_internal")
+ ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
+ "UnicodeInternalTest")
+ assert (u"ab", 12) == ignored
More information about the pypy-commit
mailing list