[pypy-commit] pypy default: Consider utf16 surrogates when encoding to raw_unicode_escape,
amauryfa
noreply at buildbot.pypy.org
Mon Apr 23 23:59:28 CEST 2012
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch:
Changeset: r54700:7fc6072593dd
Date: 2012-04-23 23:54 +0200
http://bitbucket.org/pypy/pypy/changeset/7fc6072593dd/
Log: Consider utf16 surrogates when encoding to raw_unicode_escape, as
is done for unicode_escape, but in both cases this must be done only in
narrow unicode builds!
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -1234,7 +1234,7 @@
pos += 1
continue
- if 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+ if MAXUNICODE < 65536 and 0xD800 <= oc < 0xDC00 and pos + 1 < size:
# Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
pos += 1
oc2 = ord(s[pos])
@@ -1350,6 +1350,20 @@
pos = 0
while pos < size:
oc = ord(s[pos])
+
+ if MAXUNICODE < 65536 and 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+ # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
+ pos += 1
+ oc2 = ord(s[pos])
+
+ if 0xDC00 <= oc2 <= 0xDFFF:
+ ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
+ raw_unicode_escape_helper(result, ucs)
+ pos += 1
+ continue
+ # Fall through: isolated surrogates are copied as-is
+ pos -= 1
+
if oc < 0x100:
result.append(chr(oc))
else:
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -728,3 +728,18 @@
res = interpret(f, [0x10140])
assert res == 0x10140
+
+ def test_encode_surrogate_pair(self):
+ u = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
+ if runicode.MAXUNICODE < 65536:
+ # Narrow unicode build, consider utf16 surrogate pairs
+ assert runicode.unicode_encode_unicode_escape(
+ u, len(u), True) == r'\U00010000'
+ assert runicode.unicode_encode_raw_unicode_escape(
+ u, len(u), True) == r'\U00010000'
+ else:
+ # Wide unicode build, don't merge utf16 surrogate pairs
+ assert runicode.unicode_encode_unicode_escape(
+ u, len(u), True) == r'\ud800\udc00'
+ assert runicode.unicode_encode_raw_unicode_escape(
+ u, len(u), True) == r'\ud800\udc00'
More information about the pypy-commit
mailing list