[pypy-commit] pypy py3k: Fix utf-8 encoding; all test_runicode tests pass.

Tue Oct 18 00:37:53 CEST 2011

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3k
Changeset: r48163:8942f2c46162
Date: 2011-10-17 20:18 +0200
http://bitbucket.org/pypy/pypy/changeset/8942f2c46162/

Log:	Fix utf-8 encoding; all test_runicode tests pass.

diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -253,6 +253,8 @@
     result.append((chr((0x80 | (ch & 0x3f)))))
 
 def unicode_encode_utf_8(s, size, errors, errorhandler=None):
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_encode
     assert(size >= 0)
     result = StringBuilder(size)
     pos = 0
@@ -279,11 +281,14 @@
                         pos += 1
                         _encodeUCS4(result, ch3)
                         continue
-                r, pos = errorhandler(errors, 'utf-8',
-                                      'surrogates not allowed',
-                                      s, pos-1, pos)
-                result.append(r)
-                continue
+                    r, pos = errorhandler(errors, 'utf-8',
+                                          'surrogates not allowed',
+                                          s, pos-1, pos)
+                    result.append(r)
+                    continue
+                result.append((chr((0xe0 | (ch >> 12)))))
+                result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+                result.append((chr((0x80 | (ch & 0x3f)))))
             else:
                 _encodeUCS4(result, ch)
     return result.build()
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -118,6 +118,9 @@
         for i in range(10000):
             for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                              "utf-32 utf-32-be utf-32-le").split():
+                if encoding == 'utf-8' and 0xd800 <= i <= 0xdfff:
+                    # Don't try to encode lone surrogates
+                    continue
                 self.checkdecode(unichr(i), encoding)
 
     def test_random(self):