[pypy-commit] pypy default: Give up the fact that str.encode('utf-8') doesn't accept surrogates

Sun Feb 19 10:54:49 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r90205:08b2dac171f5
Date: 2017-02-19 16:54 +0100
http://bitbucket.org/pypy/pypy/changeset/08b2dac171f5/

Log:	Give up on the rule that str.encode('utf-8') doesn't accept
	surrogates in RPython. This is just asking for trouble because the
	same code accepts surrogates when run as plain Python (2.7).

diff --git a/rpython/rtyper/rstr.py b/rpython/rtyper/rstr.py
--- a/rpython/rtyper/rstr.py
+++ b/rpython/rtyper/rstr.py
@@ -25,11 +25,7 @@
         assert value is not None
         errorhandler = runicode.default_unicode_error_decode
         u, pos = runicode.str_decode_utf_8_elidable(
-            value, len(value), 'strict', True, errorhandler, False)
-        # XXX should it really be 'allow_surrogates=False'?  In RPython,
-        # unicode.decode('utf-8') happily accepts surrogates.  This
-        # makes it hard to test untranslated (it's the cause of a
-        # failure in lib-python's test_warnings on PyPy3, for example)
+            value, len(value), 'strict', True, errorhandler, True)
         # XXX maybe the whole ''.decode('utf-8') should be not RPython.
         return self.ll.llunicode(u)
 
@@ -397,7 +393,7 @@
         errorhandler = runicode.default_unicode_error_encode
         bytes = runicode.unicode_encode_utf_8_elidable(
             s, len(s), 'strict',
-            errorhandler=errorhandler, allow_surrogates=False)
+            errorhandler=errorhandler, allow_surrogates=True)
         return self.ll.llstr(bytes)
 
     def rtype_method_encode(self, hop):
diff --git a/rpython/rtyper/test/test_runicode.py b/rpython/rtyper/test/test_runicode.py
--- a/rpython/rtyper/test/test_runicode.py
+++ b/rpython/rtyper/test/test_runicode.py
@@ -110,7 +110,13 @@
             x = u'\ud800' + unichr(n)
             return x.encode('utf-8')
 
-        self.interpret_raises(UnicodeEncodeError, g, [38])
+        # used to raise in RPython, but not when run as plain Python,
+        # which just makes code very hard to test.  Nowadays, .encode()
+        # and .decode() accept surrogates like in Python 2.7.  Use
+        # functions from the rlib.runicode module if you need stricter
+        # behavior.
+        #self.interpret_raises(UnicodeEncodeError, g, [38])
+        assert self.ll_to_string(self.interpret(g, [38])) == g(38)
 
     def test_utf_8_encoding_annotation(self):
         from rpython.rlib.runicode import unicode_encode_utf_8