[pypy-commit] pypy py3.5-newtext: Start
arigo
pypy.commits at gmail.com
Fri Dec 16 09:01:04 EST 2016
Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5-newtext
Changeset: r89098:e5f85b6b5bbf
Date: 2016-12-16 14:51 +0100
http://bitbucket.org/pypy/pypy/changeset/e5f85b6b5bbf/
Log: Start
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -6,7 +6,7 @@
from rpython.rlib.buffer import StringBuffer
from rpython.rlib.debug import make_sure_not_resized
from rpython.rlib.objectmodel import (we_are_translated, newlist_hint,
- compute_unique_id, specialize)
+ compute_unique_id, specialize, not_rpython)
from rpython.rlib.signature import signature
from rpython.rlib.rarithmetic import r_uint, SHRT_MIN, SHRT_MAX, \
INT_MIN, INT_MAX, UINT_MAX, USHRT_MAX
@@ -255,6 +255,9 @@
def identifier_w(self, space):
self._typed_unwrap_error(space, "string")
+ def text_w(self, space):
+ self._typed_unwrap_error(space, "string")
+
def bytearray_list_of_chars_w(self, space):
self._typed_unwrap_error(space, "bytearray")
@@ -1570,18 +1573,20 @@
return None if self.is_none(w_obj) else self.str_w(w_obj)
def text_or_None_w(self, w_obj):
- return None if self.is_none(w_obj) else self.identifier_w(w_obj)
+ return None if self.is_none(w_obj) else self.text_w(w_obj)
+ @not_rpython
def str_w(self, w_obj):
"""
- if w_obj is unicode, call identifier_w() (i.e., return the UTF-8
+ if w_obj is unicode, call text_w() (i.e., return the UTF-8-nosg
encoded string). Else, call bytes_w().
- Maybe we should kill str_w completely and manually substitute it with
- identifier_w/bytes_w at all call sites?
+ We should kill str_w completely and manually substitute it with
+ text_w/identifier_w/bytes_w at all call sites. It remains for
+ now for tests only.
"""
if self.isinstance_w(w_obj, self.w_unicode):
- return w_obj.identifier_w(self)
+ return w_obj.text_w(self)
else:
return w_obj.bytes_w(self)
@@ -1660,11 +1665,22 @@
raise oefmt(self.w_TypeError, "argument must be a unicode")
return self.unicode_w(w_obj)
+ def text_w(self, w_obj):
+ """
+ Unwrap a unicode object and return a 'utf-8-nosg' byte string
+ ('no surrogate'). This encoding always works and is in one-to-
+ one correspondence with the unicode string.
+ """
+ return w_obj.text_w(self)
+
def identifier_w(self, w_obj):
"""
Unwrap an object which is used as an identifier (i.e. names of
variables, methdods, functions, classes etc.). In py3k, identifiers
are unicode strings and are unwrapped as UTF-8 encoded byte strings.
+ This differs from space.text_w() because it raises an app-level
+ UnicodeEncodeError if the unicode string contains surrogates.
+ This corresponds exactly to 'str.encode(obj, "utf-8")' at app-level.
"""
return w_obj.identifier_w(self)
diff --git a/pypy/module/__pypy__/interp_stderrprinter.py b/pypy/module/__pypy__/interp_stderrprinter.py
--- a/pypy/module/__pypy__/interp_stderrprinter.py
+++ b/pypy/module/__pypy__/interp_stderrprinter.py
@@ -34,8 +34,8 @@
return space.wrap(res)
def descr_write(self, space, w_data):
- # Encode to UTF-8.
- data = space.identifier_w(w_data)
+ # Encode to UTF-8-nosg.
+ data = space.text_w(w_data)
try:
n = os.write(self.fd, data)
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -30,6 +30,16 @@
space.w_unicode, "__new__", space.w_unicode, w_uni)
assert w_new is w_uni
+ def test_identifier_or_text_w(self):
+ space = self.space
+ w_uni = space.wrap(u'abcd')
+ assert space.identifier_w(w_uni) == 'abcd'
+ assert space.text_w(w_uni) == 'abcd'
+ w_uni = space.wrap(unichr(0xd921) + unichr(0xdddd))
+ space.raises_w(space.w_UnicodeEncodeError, space.identifier_w, w_uni)
+ assert space.text_w(w_uni) == '\xed\xa4\xa1\xed\xb7\x9d'
+ # ^^^ and not the 4-byte combined character
+
class AppTestUnicodeStringStdOnly:
def test_compares(self):
diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py
--- a/pypy/objspace/std/typeobject.py
+++ b/pypy/objspace/std/typeobject.py
@@ -1073,7 +1073,7 @@
"__slots__ items must be strings, not '%T'", w_name)
if not _isidentifier(space.unicode_w(w_name)):
raise oefmt(space.w_TypeError, "__slots__ must be identifiers")
- return w_name.identifier_w(space)
+ return w_name.text_w(space)
def create_all_slots(w_self, hasoldstylebase, w_bestbase, force_new_layout):
from pypy.objspace.std.listobject import StringSort
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,7 +6,9 @@
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib.runicode import (
make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
- unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
+ unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
+ unicode_encode_utf8sp, unicode_encode_utf8_forbid_surrogates,
+ SurrogateError)
from rpython.rlib import jit
from pypy.interpreter import unicodehelper
@@ -77,24 +79,35 @@
def unicode_w(self, space):
return self._value
- def identifier_w(self, space):
+ def _identifier_or_text_w(self, space, ignore_sg):
try:
identifier = jit.conditional_call_elidable(
self._utf8, g_encode_utf8, self._value)
if not jit.isconstant(self):
self._utf8 = identifier
- except UnicodeEncodeError:
- # bah, this is just to get an official app-level
- # UnicodeEncodeError
+ except SurrogateError:
+ # If 'ignore_sg' is False, this logic is here only
+ # to get an official app-level UnicodeEncodeError.
+ # If 'ignore_sg' is True, we encode instead using
+ # unicode_encode_utf8sp().
u = self._value
- eh = unicodehelper.rpy_encode_error_handler()
- try:
- identifier = unicode_encode_utf_8(u, len(u), None,
- errorhandler=eh)
- except unicodehelper.RUnicodeEncodeError as ue:
- raise wrap_encode_error(space, ue)
+ if ignore_sg:
+ identifier = unicode_encode_utf8sp(u, len(u))
+ else:
+ eh = unicodehelper.rpy_encode_error_handler()
+ try:
+ identifier = unicode_encode_utf_8(u, len(u), None,
+ errorhandler=eh)
+ except unicodehelper.RUnicodeEncodeError as ue:
+ raise wrap_encode_error(space, ue)
return identifier
+ def text_w(self, space):
+ return self._identifier_or_text_w(space, ignore_sg=True)
+
+ def identifier_w(self, space):
+ return self._identifier_or_text_w(space, ignore_sg=False)
+
def listview_unicode(self):
return _create_list_from_unicode(self._value)
@@ -1279,7 +1292,7 @@
@jit.elidable
def g_encode_utf8(value):
"""This is a global function because of jit.conditional_call_value"""
- return value.encode('utf-8')
+ return unicode_encode_utf8_forbid_surrogates(value, len(value))
_repr_function, _ = make_unicode_escape_function(
pass_printable=True, unicode_output=True, quotes=True, prefix='')
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -428,6 +428,37 @@
_encodeUCS4(result, ch)
return result.build()
+class SurrogateError(Exception):
+ pass
+
+def unicode_encode_utf8_forbid_surrogates(s, size):
+ # Strict surrogate-forbidding utf-8 encoding. Any surrogate character
+ # raises an interp-level SurrogateError, even on 16-bit hosts.
+ # --- XXX check in detail what occurs on 16-bit hosts in PyPy 3 ---
+ assert(size >= 0)
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = ord(s[pos])
+ pos += 1
+ if ch < 0x80:
+ # Encode ASCII
+ result.append(chr(ch))
+ elif ch < 0x0800:
+ # Encode Latin-1
+ result.append(chr((0xc0 | (ch >> 6))))
+ result.append(chr((0x80 | (ch & 0x3f))))
+ elif ch < 0x10000:
+ if 0xD800 <= ch <= 0xDFFF:
+ raise SurrogateError
+ # Encode UCS2 Unicode ordinals
+ result.append((chr((0xe0 | (ch >> 12)))))
+ result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+ result.append((chr((0x80 | (ch & 0x3f)))))
+ else:
+ _encodeUCS4(result, ch)
+ return result.build()
+
# ____________________________________________________________
# utf-16
More information about the pypy-commit
mailing list