[pypy-commit] pypy default: Consider utf16 surrogates when encoding to raw_unicode_escape,

Mon Apr 23 23:59:28 CEST 2012

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r54700:7fc6072593dd
Date: 2012-04-23 23:54 +0200
http://bitbucket.org/pypy/pypy/changeset/7fc6072593dd/

Log:	Consider utf16 surrogates when encoding to raw_unicode_escape, like
	the unicode_escape, but in both case this must be done only in
	narrow unicode build!

diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -1234,7 +1234,7 @@
             pos += 1
             continue
 
-        if 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+        if MAXUNICODE < 65536 and 0xD800 <= oc < 0xDC00 and pos + 1 < size:
             # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
             pos += 1
             oc2 = ord(s[pos])
@@ -1350,6 +1350,20 @@
     pos = 0
     while pos < size:
         oc = ord(s[pos])
+
+        if MAXUNICODE < 65536 and 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+            # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
+            pos += 1
+            oc2 = ord(s[pos])
+
+            if 0xDC00 <= oc2 <= 0xDFFF:
+                ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
+                raw_unicode_escape_helper(result, ucs)
+                pos += 1
+                continue
+            # Fall through: isolated surrogates are copied as-is
+            pos -= 1
+
         if oc < 0x100:
             result.append(chr(oc))
         else:
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -728,3 +728,18 @@
 
         res = interpret(f, [0x10140])
         assert res == 0x10140
+
+    def test_encode_surrogate_pair(self):
+        u = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
+        if runicode.MAXUNICODE < 65536:
+            # Narrow unicode build, consider utf16 surrogate pairs
+            assert runicode.unicode_encode_unicode_escape(
+                u, len(u), True) == r'\U00010000'
+            assert runicode.unicode_encode_raw_unicode_escape(
+                u, len(u), True) == r'\U00010000'
+        else:
+            # Wide unicode build, don't merge utf16 surrogate pairs
+            assert runicode.unicode_encode_unicode_escape(
+                u, len(u), True) == r'\ud800\udc00'
+            assert runicode.unicode_encode_raw_unicode_escape(
+                u, len(u), True) == r'\ud800\udc00'