[pypy-commit] pypy py3.5: encode_utf8sp, decode_utf8sp
arigo
pypy.commits at gmail.com
Sat Dec 10 10:02:10 EST 2016
Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5
Changeset: r88999:22b1b835c734
Date: 2016-12-10 16:01 +0100
http://bitbucket.org/pypy/pypy/changeset/22b1b835c734/
Log: encode_utf8sp, decode_utf8sp
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,5 +1,6 @@
import py
from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
class Hit(Exception):
@@ -31,6 +32,14 @@
got = encode_utf8(sp, u"\ud800" + c, allow_surrogates=True)
assert got == "\xf0\x90\x80\x80"
+def test_encode_utf8sp():
+ sp = FakeSpace()
+ assert encode_utf8sp(sp, u"\ud800") == "\xed\xa0\x80"
+ assert encode_utf8sp(sp, u"\udc00") == "\xed\xb0\x80"
+ c = u"\udc00"
+ got = encode_utf8sp(sp, u"\ud800" + c)
+ assert got == "\xed\xa0\x80\xed\xb0\x80"
+
def test_decode_utf8():
space = FakeSpace()
assert decode_utf8(space, "abc") == u"abc"
@@ -49,3 +58,12 @@
assert map(ord, got) == [0xd800, 0xdc00]
got = decode_utf8(sp, "\xf0\x90\x80\x80", allow_surrogates=True)
assert map(ord, got) == [0x10000]
+
+def test_decode_utf8sp():
+ space = FakeSpace()
+ assert decode_utf8sp(space, "\xed\xa0\x80") == u"\ud800"
+ assert decode_utf8sp(space, "\xed\xb0\x80") == u"\udc00"
+ got = decode_utf8sp(space, "\xed\xa0\x80\xed\xb0\x80")
+ assert map(ord, got) == [0xd800, 0xdc00]
+ got = decode_utf8sp(space, "\xf0\x90\x80\x80")
+ assert map(ord, got) == [0x10000]
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -150,8 +150,21 @@
# If allow_surrogates=True, then revert to the Python 2 behavior
# which never raises UnicodeEncodeError. Surrogate pairs are then
# allowed, either paired or lone. A paired surrogate is considered
- # like the non-BMP character it stands for.
+ # like the non-BMP character it stands for. See also encode_utf8sp().
return runicode.unicode_encode_utf_8(
uni, len(uni), "strict",
errorhandler=encode_error_handler(space),
allow_surrogates=allow_surrogates)
+
+def encode_utf8sp(space, uni):
+ # Surrogate-preserving utf-8 encoding. Any surrogate character
+ # turns into its 3-byte encoding, whether it is paired or not.
+ # This should always be reversible, and the reverse is
+ # decode_utf8sp().
+ return runicode.unicode_encode_utf8sp(uni, len(uni))
+
+def decode_utf8sp(space, string):
+ # Surrogate-preserving utf-8 decoding. Assuming there is no
+ # encoding error, it should always be reversible, and the reverse is
+ # encode_utf8sp().
+ return decode_utf8(space, string, allow_surrogates=True)
More information about the pypy-commit
mailing list