[pypy-svn] r48606 - in pypy/branch/more-unicode-improvements/pypy/rlib: . test

Mon Nov 12 18:49:48 CET 2007

Author: cfbolz
Date: Mon Nov 12 18:49:47 2007
New Revision: 48606

Modified:
   pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
   pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
Log:
tests for decoding errors, fix a small bug in utf8


Modified: pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
==============================================================================

--- pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py	(original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py	Mon Nov 12 18:49:47 2007
@@ -59,6 +59,8 @@
                 r, pos = errorhandler(errors, "utf8",
                                       "unexpected end of data", s,  pos, size)
                 p += r
+                if (pos + n > size):
+                    break
         if n == 0:
             res = errorhandler(errors, "utf8", "unexpected code byte",
                                s,  pos, pos + 1)

Modified: pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py	(original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py	Mon Nov 12 18:49:47 2007
@@ -46,6 +46,31 @@
         assert called[0]
         assert "42424242" in result
 
+    def checkdecodeerror(self, s, encoding, start, stop, addstuff=True):
+        called = [0]
+        def errorhandler(errors, enc, msg, t, startingpos,
+                         endingpos, decode=True):
+            called[0] += 1
+            if called[0] == 1:
+                assert errors == "foo!"
+                assert enc == encoding
+                assert t is s
+                assert start == startingpos
+                assert stop == endingpos
+                assert decode
+                return u"42424242", stop
+            return "", endingpos
+        decoder = getattr(runicode,
+                          "str_decode_%s" % encoding.replace("-", ""))
+        if addstuff:
+            s += "some rest in ascii"
+        result, _ = decoder(s, len(s), "foo!", True, errorhandler)
+        assert called[0] > 0
+        assert "42424242" in result
+        if addstuff:
+            assert result.endswith(u"some rest in ascii")
+
+
 class TestDecoding(UnicodeTests):
     
     # XXX test bom recognition in utf-16
@@ -61,7 +86,6 @@
             for encoding in "utf8 latin1 utf16 utf-16-be utf-16-le".split():
                 self.checkdecode(unichr(i), encoding)
 
-
     def test_random(self):
         for i in range(10000):
             uni = unichr(random.randrange(sys.maxunicode))
@@ -72,6 +96,28 @@
         for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
             self.checkdecode(s, "utf8")
 
+    def test_utf8_errors(self):
+        for s in [# unexpected end of data
+                  "\xd7", "\xd6", "\xeb\x96", "\xf0\x90\x91"]:
+            self.checkdecodeerror(s, "utf8", 0, len(s), addstuff=False)
+            
+        for s in [# unexpected code byte
+                  "\x81", "\xbf",
+                  # invalid data 2 byte
+                  "\xd7\x50", "\xd6\x06", "\xd6\xD6",
+                  # invalid data 3 byte
+                  "\xeb\x56\x95", "\xeb\x06\x95", "\xeb\xD6\x95",
+                  "\xeb\x96\x55", "\xeb\x96\x05", "\xeb\x96\xD5",
+                  # invalid data 4 byte
+                  "\xf0\x50\x91\x93", "\xf0\x00\x91\x93", "\xf0\xd0\x91\x93", 
+                  "\xf0\x90\x51\x93", "\xf0\x90\x01\x93", "\xf0\x90\xd1\x93", 
+                  "\xf0\x90\x91\x53", "\xf0\x90\x91\x03", "\xf0\x90\x91\xd3", 
+                  ]:
+            self.checkdecodeerror(s, "utf8", 0, len(s), addstuff=True)
+
+    def test_ascii_error(self):
+        self.checkdecodeerror("abc\xFF\xFF\xFFcde", "ascii", 3, 4)
+
 
 class TestEncoding(UnicodeTests):
     def test_all_ascii(self):