[pypy-commit] pypy default: Give up the fact that str.encode('utf-8') doesn't accept surrogates
arigo
pypy.commits at gmail.com
Sun Feb 19 10:54:49 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r90205:08b2dac171f5
Date: 2017-02-19 16:54 +0100
http://bitbucket.org/pypy/pypy/changeset/08b2dac171f5/
Log: Give up the fact that str.encode('utf-8') doesn't accept surrogates
in RPython. This is just asking for trouble because the same code
accepts surrogates when run as plain Python (2.7).
diff --git a/rpython/rtyper/rstr.py b/rpython/rtyper/rstr.py
--- a/rpython/rtyper/rstr.py
+++ b/rpython/rtyper/rstr.py
@@ -25,11 +25,7 @@
assert value is not None
errorhandler = runicode.default_unicode_error_decode
u, pos = runicode.str_decode_utf_8_elidable(
- value, len(value), 'strict', True, errorhandler, False)
- # XXX should it really be 'allow_surrogates=False'? In RPython,
- # unicode.decode('utf-8') happily accepts surrogates. This
- # makes it hard to test untranslated (it's the cause of a
- # failure in lib-python's test_warnings on PyPy3, for example)
+ value, len(value), 'strict', True, errorhandler, True)
# XXX maybe the whole ''.decode('utf-8') should be not RPython.
return self.ll.llunicode(u)
@@ -397,7 +393,7 @@
errorhandler = runicode.default_unicode_error_encode
bytes = runicode.unicode_encode_utf_8_elidable(
s, len(s), 'strict',
- errorhandler=errorhandler, allow_surrogates=False)
+ errorhandler=errorhandler, allow_surrogates=True)
return self.ll.llstr(bytes)
def rtype_method_encode(self, hop):
diff --git a/rpython/rtyper/test/test_runicode.py b/rpython/rtyper/test/test_runicode.py
--- a/rpython/rtyper/test/test_runicode.py
+++ b/rpython/rtyper/test/test_runicode.py
@@ -110,7 +110,13 @@
x = u'\ud800' + unichr(n)
return x.encode('utf-8')
- self.interpret_raises(UnicodeEncodeError, g, [38])
+ # used to raise in RPython, but not when run as plain Python,
+ # which just makes code very hard to test. Nowadays, .encode()
+ # and .decode() accept surrogates like in Python 2.7. Use
+ # functions from the rlib.runicode module if you need stricter
+ # behavior.
+ #self.interpret_raises(UnicodeEncodeError, g, [38])
+ assert self.ll_to_string(self.interpret(g, [38])) == g(38)
def test_utf_8_encoding_annotation(self):
from rpython.rlib.runicode import unicode_encode_utf_8
More information about the pypy-commit
mailing list