[pypy-commit] pypy unicode-utf8: Start fighting on a new branch with utf8 being the default storage

Tue Feb 21 06:17:07 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90253:c9309766eac6
Date: 2017-02-21 12:17 +0100
http://bitbucket.org/pypy/pypy/changeset/c9309766eac6/

Log:	Start fighting on a new branch with utf8 being the default storage
	for W_Unicode object

diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -57,6 +57,7 @@
             assert 0 <= ps <= q
             substr = s[ps:q]
         else:
+            xxx
             substr = decode_unicode_utf8(space, s, ps, q)
         if rawmode:
             v = unicodehelper.decode_raw_unicode_escape(space, substr)
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -38,14 +38,16 @@
 def decode_unicode_escape(space, string):
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
-    result, consumed = runicode.str_decode_unicode_escape(
+    # XXX pick better length, maybe
+    result, consumed = runicode.str_decode_utf8_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
         unicodedata_handler=unicodedata_handler)
     return result
 
 def decode_raw_unicode_escape(space, string):
-    result, consumed = runicode.str_decode_raw_unicode_escape(
+    # XXX pick better length, maybe
+    result, consumed = runicode.str_decode_raw_utf8_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space))
     return result
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -5,7 +5,7 @@
 from pypy.interpreter import gateway
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import unwrap_spec, WrappedDefault
-from rpython.rlib.runicode import UNICHR
+from rpython.rlib.runicode import UNICHR, unichr_as_utf8
 from rpython.rlib.rfloat import isnan, isinf, round_double
 from rpython.rlib import rfloat
 import __builtin__
@@ -25,12 +25,12 @@
 @unwrap_spec(code=int)
 def unichr(space, code):
     "Return a Unicode string of one character with the given ordinal."
-    # XXX range checking!
+    # XXX this assumes unichr would be happy to return you surrogates
     try:
-        c = UNICHR(code)
+        s = unichr_as_utf8(code)
     except ValueError:
         raise oefmt(space.w_ValueError, "unichr() arg out of range")
-    return space.newunicode(c)
+    return space.newunicode(s)
 
 def len(space, w_obj):
     "len(object) -> integer\n\nReturn the number of items of a sequence or mapping."
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -1,11 +1,10 @@
 """The builtin str implementation"""
 
 from rpython.rlib import jit
-from rpython.rlib.jit import we_are_jitted
 from rpython.rlib.objectmodel import (
     compute_hash, compute_unique_id, import_from_mixin)
 from rpython.rlib.buffer import StringBuffer
-from rpython.rlib.rstring import StringBuilder, replace
+from rpython.rlib.rstring import StringBuilder
 
 from pypy.interpreter.baseobjspace import W_Root
 from pypy.interpreter.error import OperationError, oefmt
@@ -18,7 +17,7 @@
 from pypy.objspace.std.stringmethods import StringMethods
 from pypy.objspace.std.unicodeobject import (
     decode_object, unicode_from_encoded_object,
-    unicode_from_string, getdefaultencoding)
+    getdefaultencoding)
 from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
 
 
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -346,10 +346,10 @@
             return self.w_None
         return self.newtext(s)
 
-    def newunicode(self, uni):
-        assert uni is not None
-        assert isinstance(uni, unicode)
-        return W_UnicodeObject(uni)
+    def newunicode(self, utf8s):
+        assert utf8s is not None
+        assert isinstance(utf8s, str)
+        return W_UnicodeObject(utf8s)
 
     def type(self, w_obj):
         jit.promote(w_obj.__class__)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -7,7 +7,8 @@
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib.runicode import (
     make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
-    unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
+    unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
+    check_ascii, AsciiCheckError)
 
 from pypy.interpreter import unicodehelper
 from pypy.interpreter.baseobjspace import W_Root
@@ -23,21 +24,24 @@
 
 __all__ = ['W_UnicodeObject', 'wrapunicode', 'plain_str2unicode',
            'encode_object', 'decode_object', 'unicode_from_object',
-           'unicode_from_string', 'unicode_to_decimal_w']
+           'utf8_from_string', 'unicode_to_decimal_w']
 
 
 class W_UnicodeObject(W_Root):
     import_from_mixin(StringMethods)
-    _immutable_fields_ = ['_value']
+    _immutable_fields_ = ['_utf8']
 
-    @enforceargs(uni=unicode)
-    def __init__(self, unistr):
-        assert isinstance(unistr, unicode)
-        self._value = unistr
+    @enforceargs(utf8str=str)
+    def __init__(self, utf8str, ucs4str=None):
+        assert isinstance(utf8str, str)
+        if ucs4str is not None:
+            assert isinstance(ucs4str, unicode)
+        self._utf8 = utf8str
+        self._ucs4 = ucs4str
 
     def __repr__(self):
         """representation for debugging purposes"""
-        return "%s(%r)" % (self.__class__.__name__, self._value)
+        return "%s(%r)" % (self.__class__.__name__, self._utf8)
 
     def unwrap(self, space):
         # for testing
@@ -79,8 +83,8 @@
     def str_w(self, space):
         return space.text_w(space.str(self))
 
-    def unicode_w(self, space):
-        return self._value
+    def utf8_w(self, space):
+        return self._utf8
 
     def readbuf_w(self, space):
         from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
@@ -117,7 +121,7 @@
     def _len(self):
         return len(self._value)
 
-    _val = unicode_w
+    _val = utf8_w
 
     @staticmethod
     def _use_rstr_ops(space, w_other):
@@ -128,9 +132,10 @@
     @staticmethod
     def _op_val(space, w_other, strict=None):
         if isinstance(w_other, W_UnicodeObject):
-            return w_other._value
+            return w_other._utf8
         if space.isinstance_w(w_other, space.w_bytes):
-            return unicode_from_string(space, w_other)._value
+            return utf8_from_string(space, w_other)._utf8
+        yyy
         if strict:
             raise oefmt(space.w_TypeError,
                 "%s arg must be None, unicode or str", strict)
@@ -235,7 +240,7 @@
         return encode_object(space, self, None, None)
 
     def descr_hash(self, space):
-        x = compute_hash(self._value)
+        x = compute_hash(self._utf8)
         return space.newint(x)
 
     def descr_eq(self, space, w_other):
@@ -564,17 +569,19 @@
     return unicode_from_encoded_object(space, w_res, None, "strict")
 
 
-def unicode_from_string(space, w_bytes):
+def utf8_from_string(space, w_bytes):
     # this is a performance and bootstrapping hack
     encoding = getdefaultencoding(space)
     if encoding != 'ascii':
+        xxx
         return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
     s = space.bytes_w(w_bytes)
     try:
-        return W_UnicodeObject(s.decode("ascii"))
-    except UnicodeDecodeError:
+        check_ascii(s)
+    except AsciiCheckError:
         # raising UnicodeDecodeError is messy, "please crash for me"
         return unicode_from_encoded_object(space, w_bytes, "ascii", "strict")
+    return W_UnicodeObject(s)
 
 
 class UnicodeDocstrings:
@@ -1121,7 +1128,7 @@
     return [s for s in value]
 
 
-W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
+W_UnicodeObject.EMPTY = W_UnicodeObject('')
 
 
 # Helper for converting int/long
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -43,6 +43,36 @@
             return ord(u[0])
         raise TypeError
 
+def unichr_as_utf8(code):
+    """ Encode code (numeric value) as utf8 encoded string
+    """
+    if code < 0:
+        raise ValueError
+    if code < 0x80:
+        # Encode ASCII
+        return chr(code)
+    if code < 0x0800:
+        # Encode Latin-1
+        return chr((0xc0 | (code >> 6))) + chr((0x80 | (code & 0x3f)))
+    if code < 0x10000:
+        return (chr((0xe0 | (code >> 12))) +
+                chr((0x80 | ((code >> 6) & 0x3f))) +
+                chr((0x80 | (code & 0x3f))))
+    if code < 0x10ffff:
+        return (chr((0xf0 | (code >> 18))) +
+                chr((0x80 | ((code >> 12) & 0x3f))) +
+                chr((0x80 | ((code >> 6) & 0x3f))) +
+                chr((0x80 | (code & 0x3f))))
+    raise ValueError
+
+class AsciiCheckError(Exception):
+    pass
+
+def check_ascii(s):
+    for i in range(0, len(s)):
+        if ord(s[i]) & 0x80:
+            raise AsciiCheckError
+        
 if MAXUNICODE > sys.maxunicode:
     # A version of unichr which allows codes outside the BMP
     # even on narrow unicode builds.
@@ -1377,6 +1407,129 @@
 
     return builder.build(), pos
 
+def str_decode_utf8_escape(s, size, errors, final=False,
+                              errorhandler=None,
+                              unicodedata_handler=None):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+
+    if size == 0:
+        return '', 0
+
+    builder = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            builder.append(ch)
+            pos += 1
+            continue
+
+        # - Escapes
+        pos += 1
+        if pos >= size:
+            message = "\\ at end of string"
+            res, pos = errorhandler(errors, "unicodeescape",
+                                    message, s, pos-1, size)
+            builder.append(res)
+            continue
+
+        ch = s[pos]
+        pos += 1
+        # \x escapes
+        if ch == '\n': pass
+        elif ch == '\\': builder.append('\\')
+        elif ch == '\'': builder.append('\'')
+        elif ch == '\"': builder.append('\"')
+        elif ch == 'b' : builder.append('\b')
+        elif ch == 'f' : builder.append('\f')
+        elif ch == 't' : builder.append('\t')
+        elif ch == 'n' : builder.append('\n')
+        elif ch == 'r' : builder.append('\r')
+        elif ch == 'v' : builder.append('\v')
+        elif ch == 'a' : builder.append('\a')
+        elif '0' <= ch <= '7':
+            xxx
+            x = ord(ch) - ord('0')
+            if pos < size:
+                ch = s[pos]
+                if '0' <= ch <= '7':
+                    pos += 1
+                    x = (x<<3) + ord(ch) - ord('0')
+                    if pos < size:
+                        ch = s[pos]
+                        if '0' <= ch <= '7':
+                            pos += 1
+                            x = (x<<3) + ord(ch) - ord('0')
+            builder.append(unichr(x))
+        # hex escapes
+        # \xXX
+        elif ch == 'x':
+            xxx
+            digits = 2
+            message = "truncated \\xXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        # \uXXXX
+        elif ch == 'u':
+            xxx
+            digits = 4
+            message = "truncated \\uXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        #  \UXXXXXXXX
+        elif ch == 'U':
+            xxx
+            digits = 8
+            message = "truncated \\UXXXXXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        # \N{name}
+        elif ch == 'N' and unicodedata_handler is not None:
+            xxx
+            message = "malformed \\N character escape"
+            look = pos
+
+            if look < size and s[look] == '{':
+                # look for the closing brace
+                while look < size and s[look] != '}':
+                    look += 1
+                if look < size and s[look] == '}':
+                    # found a name.  look it up in the unicode database
+                    message = "unknown Unicode character name"
+                    name = s[pos+1:look]
+                    code = unicodedata_handler.call(name)
+                    if code < 0:
+                        res, pos = errorhandler(errors, "unicodeescape",
+                                                message, s, pos-1, look+1)
+                        builder.append(res)
+                        continue
+                    pos = look + 1
+                    if code <= MAXUNICODE:
+                        builder.append(UNICHR(code))
+                    else:
+                        code -= 0x10000L
+                        builder.append(unichr(0xD800 + (code >> 10)))
+                        builder.append(unichr(0xDC00 + (code & 0x03FF)))
+                else:
+                    res, pos = errorhandler(errors, "unicodeescape",
+                                            message, s, pos-1, look+1)
+                    builder.append(res)
+            else:
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, look+1)
+                builder.append(res)
+        else:
+            builder.append('\\')
+            builder.append(ch)
+
+    return builder.build(), pos
+
 def make_unicode_escape_function(pass_printable=False, unicode_output=False,
                                  quotes=False, prefix=None):
     # Python3 has two similar escape functions: One to implement
@@ -1497,6 +1650,54 @@
 # ____________________________________________________________
 # Raw unicode escape
 
+def str_decode_raw_utf8_escape(s, size, errors, final=False,
+                               errorhandler=None):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+    if size == 0:
+        return '', 0
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            result.append(ch)
+            pos += 1
+            continue
+
+        # \u-escapes are only interpreted iff the number of leading
+        # backslashes is odd
+        bs = pos
+        while pos < size:
+            pos += 1
+            if pos == size or s[pos] != '\\':
+                break
+            result.append('\\')
+
+        # we have a backslash at the end of the string, stop here
+        if pos >= size:
+            result.append('\\')
+            break
+
+        if ((pos - bs) & 1 == 0 or
+            pos >= size or
+            (s[pos] != 'u' and s[pos] != 'U')):
+            result.append('\\')
+            result.append(s[pos])
+            pos += 1
+            continue
+
+        digits = 4 if s[pos] == 'u' else 8
+        message = "truncated \\uXXXX"
+        pos += 1
+        xxx # change hexescape to deal with utf8
+        pos = hexescape(result, s, pos, digits,
+                        "rawunicodeescape", errorhandler, message, errors)
+
+    return result.build(), pos
+
 def str_decode_raw_unicode_escape(s, size, errors, final=False,
                                   errorhandler=None):
     if errorhandler is None: