[pypy-commit] pypy unicode-utf8: Start passing some unicode tests. UCS4 is not completely supported yet.

Wed Feb 22 12:02:47 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90309:f05bed30187f
Date: 2017-02-22 18:02 +0100
http://bitbucket.org/pypy/pypy/changeset/f05bed30187f/

Log:	Start passing some unicode tests. UCS4 is not completely supported yet.

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -58,7 +58,8 @@
     # you still get two surrogate unicode characters in the result.
     # These are the Python2 rules; Python3 differs.
     consumed, length = rutf8.str_check_utf8(
-        string, "strict", final=True, errorhandler=decode_error_handler(space),
+        string, len(string), "strict", final=True,
+        errorhandler=decode_error_handler(space),
         allow_surrogates=True)
     return length
 
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -16,7 +16,7 @@
 from pypy.objspace.std.formatting import mod_format
 from pypy.objspace.std.stringmethods import StringMethods
 from pypy.objspace.std.unicodeobject import (
-    decode_object, utf8_from_encoded_object,
+    decode_object, unicode_from_encoded_object,
     getdefaultencoding)
 from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
 
@@ -717,7 +717,7 @@
             self_as_unicode = unicode_from_encoded_object(space, self, None,
                                                           None)
             return space.newbool(
-                self_as_unicode._value.find(w_sub._value) >= 0)
+                self_as_unicode._utf8.find(w_sub._utf8) >= 0)
         return self._StringMethods_descr_contains(space, w_sub)
 
     _StringMethods_descr_replace = descr_replace
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -94,7 +94,7 @@
         else:
             return space.fromcache(BytesListStrategy)
 
-    elif type(w_firstobj) is W_UnicodeObject:
+    elif False and type(w_firstobj) is W_UnicodeObject: # disable unicode list strat
         # check for all-unicodes
         for i in range(1, len(list_w)):
             if type(list_w[i]) is not W_UnicodeObject:
@@ -195,6 +195,7 @@
 
     @staticmethod
     def newlist_unicode(space, list_u):
+        xxxx
         strategy = space.fromcache(UnicodeListStrategy)
         storage = strategy.erase(list_u)
         return W_ListObject.from_storage_and_strategy(space, storage, strategy)
@@ -958,8 +959,8 @@
             strategy = self.space.fromcache(IntegerListStrategy)
         elif type(w_item) is W_BytesObject:
             strategy = self.space.fromcache(BytesListStrategy)
-        elif type(w_item) is W_UnicodeObject:
-            strategy = self.space.fromcache(UnicodeListStrategy)
+        #elif type(w_item) is W_UnicodeObject:
+        #    strategy = self.space.fromcache(UnicodeListStrategy)
         elif type(w_item) is W_FloatObject:
             strategy = self.space.fromcache(FloatListStrategy)
         else:
@@ -2005,7 +2006,7 @@
         return self.space.newunicode(stringval)
 
     def unwrap(self, w_string):
-        return self.space.unicode_w(w_string)
+        return self.space.utf8_w(w_string)
 
     erase, unerase = rerased.new_erasing_pair("unicode")
     erase = staticmethod(erase)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -299,6 +299,7 @@
     newlist_text = newlist_bytes
 
     def newlist_unicode(self, list_u):
+        return self.newlist(list_u)
         return W_ListObject.newlist_unicode(self, list_u)
 
     def newlist_int(self, list_i):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -2,13 +2,13 @@
 
 from rpython.rlib.objectmodel import (
     compute_hash, compute_unique_id, import_from_mixin,
-    enforceargs)
+    enforceargs, newlist_hint)
 from rpython.rlib.buffer import StringBuffer
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib.runicode import (
     make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
     unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
-from rpython.rlib import rutf8
+from rpython.rlib import rutf8, jit
 
 from pypy.interpreter import unicodehelper
 from pypy.interpreter.baseobjspace import W_Root
@@ -110,8 +110,8 @@
                          "found", len(self._value))
         return space.newint(ord(self._value[0]))
 
-    def _new(self, value):
-        return W_UnicodeObject(value)
+    def _new(self, value, length):
+        return W_UnicodeObject(value, length)
 
     def _new_from_list(self, value):
         return W_UnicodeObject(u''.join(value))
@@ -120,7 +120,7 @@
         return W_UnicodeObject.EMPTY
 
     def _len(self):
-        return len(self._value)
+        return self._length
 
     _val = utf8_w
 
@@ -135,18 +135,25 @@
         if isinstance(w_other, W_UnicodeObject):
             return w_other._utf8
         if space.isinstance_w(w_other, space.w_bytes):
-            return utf8_from_string(space, w_other)._utf8
+            return unicode_from_string(space, w_other)._utf8
         if strict:
             raise oefmt(space.w_TypeError,
                 "%s arg must be None, unicode or str", strict)
-        return utf8_from_encoded_object(
-            space, w_other, None, "strict")._value
+        return unicode_from_encoded_object(
+            space, w_other, None, "strict")._utf8
+
+    def _convert_to_unicode(self, space, w_other):
+        if isinstance(w_other, W_UnicodeObject):
+            return w_other
+        if space.isinstance_w(w_other, space.w_bytes):
+            return unicode_from_string(space, w_other)
+        return unicode_from_encoded_object(space, w_other, None, "strict")
 
     def _chr(self, char):
         assert len(char) == 1
         return unicode(char)[0]
 
-    _builder = UnicodeBuilder
+    _builder = StringBuilder
 
     def _isupper(self, ch):
         return unicodedb.isupper(ord(ch))
@@ -423,6 +430,46 @@
     def _starts_ends_overflow(self, prefix):
         return len(prefix) == 0
 
+    def descr_add(self, space, w_other):
+        try:
+            w_other = self._convert_to_unicode(space, w_other)
+        except OperationError as e:
+            if e.match(space, space.w_TypeError):
+                return space.w_NotImplemented
+            raise
+        return W_UnicodeObject(self._utf8 + w_other._utf8,
+                               self._length + w_other._length)
+
+    @jit.look_inside_iff(lambda self, space, list_w, size:
+                         jit.loop_unrolling_heuristic(list_w, size))
+    def _str_join_many_items(self, space, list_w, size):
+        value = self._utf8
+        lgt = self._length * (size - 1)
+
+        prealloc_size = len(value) * (size - 1)
+        unwrapped = newlist_hint(size)
+        for i in range(size):
+            w_s = list_w[i]
+            check_item = self._join_check_item(space, w_s)
+            if check_item == 1:
+                raise oefmt(space.w_TypeError,
+                            "sequence item %d: expected string, %T found",
+                            i, w_s)
+            elif check_item == 2:
+                return self._join_autoconvert(space, list_w)
+            # XXX Maybe the extra copy here is okay? It was basically going to
+            #     happen anyway, what with being placed into the builder
+            w_u = self._convert_to_unicode(space, w_s)
+            unwrapped.append(w_u._utf8)
+            lgt += w_u._length
+            prealloc_size += len(unwrapped[i])
+
+        sb = self._builder(prealloc_size)
+        for i in range(size):
+            if value and i != 0:
+                sb.append(value)
+            sb.append(unwrapped[i])
+        return self._new(sb.build(), lgt)
 
 def wrapunicode(space, uni):
     return W_UnicodeObject(uni)
@@ -515,7 +562,7 @@
                 unicodehelper.decode_error_handler(space)(None,
                     'ascii', "ordinal not in range(128)", s, e.pos, e.pos+1)
                 assert False
-            return space.newunicode(s)
+            return space.newunicode(s, len(s))
         if encoding == 'utf-8':
             yyy
             s = space.charbuf_w(w_obj)
@@ -534,7 +581,7 @@
     return w_retval
 
 
-def utf8_from_encoded_object(space, w_obj, encoding, errors):
+def unicode_from_encoded_object(space, w_obj, encoding, errors):
     # explicitly block bytearray on 2.7
     from .bytearrayobject import W_BytearrayObject
     if isinstance(w_obj, W_BytearrayObject):
@@ -571,7 +618,7 @@
     return unicode_from_encoded_object(space, w_res, None, "strict")
 
 
-def utf8_from_string(space, w_bytes):
+def unicode_from_string(space, w_bytes):
     # this is a performance and bootstrapping hack
     encoding = getdefaultencoding(space)
     if encoding != 'ascii':
@@ -582,7 +629,7 @@
         rutf8.check_ascii(s)
     except rutf8.AsciiCheckError:
         # raising UnicodeDecodeError is messy, "please crash for me"
-        return utf8_from_encoded_object(space, w_bytes, "ascii", "strict")
+        return unicode_from_encoded_object(space, w_bytes, "ascii", "strict")
     return W_UnicodeObject(s, len(s))
 
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -243,7 +243,7 @@
         errorhandler = default_unicode_error_decode
 
     if size == 0:
-        return '', 0
+        return '', 0, 0
 
     lgt = 0
     builder = StringBuilder(size)