[pypy-commit] pypy py3.5-newtext: Start

Fri Dec 16 09:01:04 EST 2016

Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5-newtext
Changeset: r89098:e5f85b6b5bbf
Date: 2016-12-16 14:51 +0100
http://bitbucket.org/pypy/pypy/changeset/e5f85b6b5bbf/

Log:	Start

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -6,7 +6,7 @@
 from rpython.rlib.buffer import StringBuffer
 from rpython.rlib.debug import make_sure_not_resized
 from rpython.rlib.objectmodel import (we_are_translated, newlist_hint,
-     compute_unique_id, specialize)
+     compute_unique_id, specialize, not_rpython)
 from rpython.rlib.signature import signature
 from rpython.rlib.rarithmetic import r_uint, SHRT_MIN, SHRT_MAX, \
     INT_MIN, INT_MAX, UINT_MAX, USHRT_MAX
@@ -255,6 +255,9 @@
     def identifier_w(self, space):
         self._typed_unwrap_error(space, "string")
 
+    def text_w(self, space):
+        self._typed_unwrap_error(space, "string")
+
     def bytearray_list_of_chars_w(self, space):
         self._typed_unwrap_error(space, "bytearray")
 
@@ -1570,18 +1573,20 @@
         return None if self.is_none(w_obj) else self.str_w(w_obj)
 
     def text_or_None_w(self, w_obj):
-        return None if self.is_none(w_obj) else self.identifier_w(w_obj)
+        return None if self.is_none(w_obj) else self.text_w(w_obj)
 
+    @not_rpython
     def str_w(self, w_obj):
         """
-        if w_obj is unicode, call identifier_w() (i.e., return the UTF-8
+        if w_obj is unicode, call text_w() (i.e., return the UTF-8-nosg
         encoded string). Else, call bytes_w().
 
-        Maybe we should kill str_w completely and manually substitute it with
-        identifier_w/bytes_w at all call sites?
+        We should kill str_w completely and manually substitute it with
+        text_w/identifier_w/bytes_w at all call sites.  It remains for
+        now for tests only.
         """
         if self.isinstance_w(w_obj, self.w_unicode):
-            return w_obj.identifier_w(self)
+            return w_obj.text_w(self)
         else:
             return w_obj.bytes_w(self)
 
@@ -1660,11 +1665,22 @@
             raise oefmt(self.w_TypeError, "argument must be a unicode")
         return self.unicode_w(w_obj)
 
+    def text_w(self, w_obj):
+        """
+        Unwrap a unicode object and return a 'utf-8-nosg' byte string
+        ('no surrogate').  This encoding always works and is in one-to-
+        one correspondence with the unicode.
+        """
+        return w_obj.text_w(self)
+
     def identifier_w(self, w_obj):
         """
         Unwrap an object which is used as an identifier (i.e. names of
         variables, methdods, functions, classes etc.). In py3k, identifiers
         are unicode strings and are unwrapped as UTF-8 encoded byte strings.
+        This differs from space.text_w() because it raises an app-level
+        UnicodeEncodeError if the unicode string contains surrogates.
+        This corresponds exactly to 'str.encode(obj, "utf-8")' at app-level.
         """
         return w_obj.identifier_w(self)
 
diff --git a/pypy/module/__pypy__/interp_stderrprinter.py b/pypy/module/__pypy__/interp_stderrprinter.py
--- a/pypy/module/__pypy__/interp_stderrprinter.py
+++ b/pypy/module/__pypy__/interp_stderrprinter.py
@@ -34,8 +34,8 @@
         return space.wrap(res)
 
     def descr_write(self, space, w_data):
-        # Encode to UTF-8.
-        data = space.identifier_w(w_data)
+        # Encode to UTF-8-nosg.
+        data = space.text_w(w_data)
 
         try:
             n = os.write(self.fd, data)
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -30,6 +30,16 @@
                 space.w_unicode, "__new__", space.w_unicode, w_uni)
         assert w_new is w_uni
 
+    def test_identifier_or_text_w(self):
+        space = self.space
+        w_uni = space.wrap(u'abcd')
+        assert space.identifier_w(w_uni) == 'abcd'
+        assert space.text_w(w_uni) == 'abcd'
+        w_uni = space.wrap(unichr(0xd921) + unichr(0xdddd))
+        space.raises_w(space.w_UnicodeEncodeError, space.identifier_w, w_uni)
+        assert space.text_w(w_uni) == '\xed\xa4\xa1\xed\xb7\x9d'
+        #                             ^^^ and not the 4-byte combined character
+
 
 class AppTestUnicodeStringStdOnly:
     def test_compares(self):
diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py
--- a/pypy/objspace/std/typeobject.py
+++ b/pypy/objspace/std/typeobject.py
@@ -1073,7 +1073,7 @@
             "__slots__ items must be strings, not '%T'", w_name)
     if not _isidentifier(space.unicode_w(w_name)):
         raise oefmt(space.w_TypeError, "__slots__ must be identifiers")
-    return w_name.identifier_w(space)
+    return w_name.text_w(space)
 
 def create_all_slots(w_self, hasoldstylebase, w_bestbase, force_new_layout):
     from pypy.objspace.std.listobject import StringSort
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,7 +6,9 @@
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib.runicode import (
     make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
-    unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
+    unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
+    unicode_encode_utf8sp, unicode_encode_utf8_forbid_surrogates,
+    SurrogateError)
 from rpython.rlib import jit
 
 from pypy.interpreter import unicodehelper
@@ -77,24 +79,35 @@
     def unicode_w(self, space):
         return self._value
 
-    def identifier_w(self, space):
+    def _identifier_or_text_w(self, space, ignore_sg):
         try:
             identifier = jit.conditional_call_elidable(
                                 self._utf8, g_encode_utf8, self._value)
             if not jit.isconstant(self):
                 self._utf8 = identifier
-        except UnicodeEncodeError:
-            # bah, this is just to get an official app-level
-            # UnicodeEncodeError
+        except SurrogateError:
+            # If 'ignore_sg' is False, this logic is here only
+            # to get an official app-level UnicodeEncodeError.
+            # If 'ignore_sg' is True, we encode instead using
+            # unicode_encode_utf8sp().
             u = self._value
-            eh = unicodehelper.rpy_encode_error_handler()
-            try:
-                identifier = unicode_encode_utf_8(u, len(u), None,
-                                                  errorhandler=eh)
-            except unicodehelper.RUnicodeEncodeError as ue:
-                raise wrap_encode_error(space, ue)
+            if ignore_sg:
+                identifier = unicode_encode_utf8sp(u, len(u))
+            else:
+                eh = unicodehelper.rpy_encode_error_handler()
+                try:
+                    identifier = unicode_encode_utf_8(u, len(u), None,
+                                                      errorhandler=eh)
+                except unicodehelper.RUnicodeEncodeError as ue:
+                    raise wrap_encode_error(space, ue)
         return identifier
 
+    def text_w(self, space):
+        return self._identifier_or_text_w(space, ignore_sg=True)
+
+    def identifier_w(self, space):
+        return self._identifier_or_text_w(space, ignore_sg=False)
+
     def listview_unicode(self):
         return _create_list_from_unicode(self._value)
 
@@ -1279,7 +1292,7 @@
 @jit.elidable
 def g_encode_utf8(value):
     """This is a global function because of jit.conditional_call_value"""
-    return value.encode('utf-8')
+    return unicode_encode_utf8_forbid_surrogates(value, len(value))
 
 _repr_function, _ = make_unicode_escape_function(
     pass_printable=True, unicode_output=True, quotes=True, prefix='')
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -428,6 +428,37 @@
             _encodeUCS4(result, ch)
     return result.build()
 
+class SurrogateError(Exception):
+    pass
+
+def unicode_encode_utf8_forbid_surrogates(s, size):
+    # Strict surrogate-forbidding utf-8 encoding.  Any surrogate character
+    # raises an interp-level SurrogateError, even on 16-bit hosts.
+    # --- XXX check in detail what occurs on 16-bit hosts in PyPy 3 ---
+    assert(size >= 0)
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+        if ch < 0x80:
+            # Encode ASCII
+            result.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode Latin-1
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        elif ch < 0x10000:
+            if 0xD800 <= ch <= 0xDFFF:
+                raise SurrogateError
+            # Encode UCS2 Unicode ordinals
+            result.append((chr((0xe0 | (ch >> 12)))))
+            result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+            result.append((chr((0x80 | (ch & 0x3f)))))
+        else:
+            _encodeUCS4(result, ch)
+    return result.build()
+
 # ____________________________________________________________
 # utf-16