[pypy-svn] r48606 - in pypy/branch/more-unicode-improvements/pypy/rlib: . test
cfbolz at codespeak.net
cfbolz at codespeak.net
Mon Nov 12 18:49:48 CET 2007
Author: cfbolz
Date: Mon Nov 12 18:49:47 2007
New Revision: 48606
Modified:
pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
Log:
Add tests for decoding errors; fix a small bug in the UTF-8 decoder.
Modified: pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py Mon Nov 12 18:49:47 2007
@@ -59,6 +59,8 @@
r, pos = errorhandler(errors, "utf8",
"unexpected end of data", s, pos, size)
p += r
+ if (pos + n > size):
+ break
if n == 0:
res = errorhandler(errors, "utf8", "unexpected code byte",
s, pos, pos + 1)
Modified: pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py Mon Nov 12 18:49:47 2007
@@ -46,6 +46,31 @@
assert called[0]
assert "42424242" in result
+ def checkdecodeerror(self, s, encoding, start, stop, addstuff=True):
+ called = [0]
+ def errorhandler(errors, enc, msg, t, startingpos,
+ endingpos, decode=True):
+ called[0] += 1
+ if called[0] == 1:
+ assert errors == "foo!"
+ assert enc == encoding
+ assert t is s
+ assert start == startingpos
+ assert stop == endingpos
+ assert decode
+ return u"42424242", stop
+ return "", endingpos
+ decoder = getattr(runicode,
+ "str_decode_%s" % encoding.replace("-", ""))
+ if addstuff:
+ s += "some rest in ascii"
+ result, _ = decoder(s, len(s), "foo!", True, errorhandler)
+ assert called[0] > 0
+ assert "42424242" in result
+ if addstuff:
+ assert result.endswith(u"some rest in ascii")
+
+
class TestDecoding(UnicodeTests):
# XXX test bom recognition in utf-16
@@ -61,7 +86,6 @@
for encoding in "utf8 latin1 utf16 utf-16-be utf-16-le".split():
self.checkdecode(unichr(i), encoding)
-
def test_random(self):
for i in range(10000):
uni = unichr(random.randrange(sys.maxunicode))
@@ -72,6 +96,28 @@
for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
self.checkdecode(s, "utf8")
+ def test_utf8_errors(self):
+ for s in [# unexpected end of data
+ "\xd7", "\xd6", "\xeb\x96", "\xf0\x90\x91"]:
+ self.checkdecodeerror(s, "utf8", 0, len(s), addstuff=False)
+
+ for s in [# unexpected code byte
+ "\x81", "\xbf",
+ # invalid data 2 byte
+ "\xd7\x50", "\xd6\x06", "\xd6\xD6",
+ # invalid data 3 byte
+ "\xeb\x56\x95", "\xeb\x06\x95", "\xeb\xD6\x95",
+ "\xeb\x96\x55", "\xeb\x96\x05", "\xeb\x96\xD5",
+ # invalid data 4 byte
+ "\xf0\x50\x91\x93", "\xf0\x00\x91\x93", "\xf0\xd0\x91\x93",
+ "\xf0\x90\x51\x93", "\xf0\x90\x01\x93", "\xf0\x90\xd1\x93",
+ "\xf0\x90\x91\x53", "\xf0\x90\x91\x03", "\xf0\x90\x91\xd3",
+ ]:
+ self.checkdecodeerror(s, "utf8", 0, len(s), addstuff=True)
+
+ def test_ascii_error(self):
+ self.checkdecodeerror("abc\xFF\xFF\xFFcde", "ascii", 3, 4)
+
class TestEncoding(UnicodeTests):
def test_all_ascii(self):
More information about the Pypy-commit mailing list