[pypy-commit] pypy py3.5: encode_utf8sp, decode_utf8sp

Sat Dec 10 10:02:10 EST 2016

Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5
Changeset: r88999:22b1b835c734
Date: 2016-12-10 16:01 +0100
http://bitbucket.org/pypy/pypy/changeset/22b1b835c734/

Log:	encode_utf8sp, decode_utf8sp

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,5 +1,6 @@
 import py
 from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
 
 
 class Hit(Exception):
@@ -31,6 +32,14 @@
     got = encode_utf8(sp, u"\ud800" + c, allow_surrogates=True)
     assert got == "\xf0\x90\x80\x80"
 
+def test_encode_utf8sp():
+    sp = FakeSpace()
+    assert encode_utf8sp(sp, u"\ud800") == "\xed\xa0\x80"
+    assert encode_utf8sp(sp, u"\udc00") == "\xed\xb0\x80"
+    c = u"\udc00"
+    got = encode_utf8sp(sp, u"\ud800" + c)
+    assert got == "\xed\xa0\x80\xed\xb0\x80"
+
 def test_decode_utf8():
     space = FakeSpace()
     assert decode_utf8(space, "abc") == u"abc"
@@ -49,3 +58,12 @@
     assert map(ord, got) == [0xd800, 0xdc00]
     got = decode_utf8(sp, "\xf0\x90\x80\x80", allow_surrogates=True)
     assert map(ord, got) == [0x10000]
+
+def test_decode_utf8sp():
+    space = FakeSpace()
+    assert decode_utf8sp(space, "\xed\xa0\x80") == u"\ud800"
+    assert decode_utf8sp(space, "\xed\xb0\x80") == u"\udc00"
+    got = decode_utf8sp(space, "\xed\xa0\x80\xed\xb0\x80")
+    assert map(ord, got) == [0xd800, 0xdc00]
+    got = decode_utf8sp(space, "\xf0\x90\x80\x80")
+    assert map(ord, got) == [0x10000]
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -150,8 +150,21 @@
     # If allow_surrogates=True, then revert to the Python 2 behavior
     # which never raises UnicodeEncodeError.  Surrogate pairs are then
     # allowed, either paired or lone.  A paired surrogate is considered
-    # like the non-BMP character it stands for.
+    # like the non-BMP character it stands for.  See also encode_utf8sp().
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=encode_error_handler(space),
         allow_surrogates=allow_surrogates)
+
+def encode_utf8sp(space, uni):
+    # Surrogate-preserving utf-8 encoding.  Any surrogate character
+    # turns into its 3-byte encoding, whether it is paired or not.
+    # This should always be reversible, and the reverse is
+    # decode_utf8sp().
+    return runicode.unicode_encode_utf8sp(uni, len(uni))
+
+def decode_utf8sp(space, string):
+    # Surrogate-preserving utf-8 decoding.  Assuming there is no
+    # encoding error, it should always be reversible, and the reverse is
+    # encode_utf8sp().
+    return decode_utf8(space, string, allow_surrogates=True)