[pypy-commit] pypy unicode-utf8: * whack whack whack

fijal pypy.commits at gmail.com
Sun Feb 26 17:42:13 EST 2017


Author: fijal
Branch: unicode-utf8
Changeset: r90369:50071ee2bad7
Date: 2017-02-26 20:07 +0100
http://bitbucket.org/pypy/pypy/changeset/50071ee2bad7/

Log:	* whack whack whack
	* have a more sensible 'check for utf8' and a slower version if we
	pass something awkward as errorhandler

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1679,6 +1679,13 @@
     def utf8_w(self, w_obj):
         return w_obj.utf8_w(self)
 
+    def unicode_w(self, w_obj):
+        return self.utf8_w(w_obj).decode('utf8')
+
+    def newunicode(self, u):
+        assert isinstance(u, unicode)
+        return self.newutf8(u.encode("utf8"), len(u))
+
     def convert_to_w_unicode(self, w_obj):
         return w_obj.convert_to_w_unicode(self)
 
@@ -1693,13 +1700,12 @@
                         "characters")
         return rstring.assert_str0(result)
 
-    def realunicode_w(self, w_obj):
-        # Like unicode_w, but only works if w_obj is really of type
+    def realutf8_w(self, w_obj):
+        # Like utf8_w, but only works if w_obj is really of type
         # 'unicode'.
-        xxx
         if not self.isinstance_w(w_obj, self.w_unicode):
             raise oefmt(self.w_TypeError, "argument must be a unicode")
-        return self.unicode_w(w_obj)
+        return self.utf8_w(w_obj)
 
     def bool_w(self, w_obj):
         # Unwraps a bool, also accepting an int for compatibility.
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -59,10 +59,12 @@
     # If there happen to be two 3-bytes encoding a pair of surrogates,
     # you still get two surrogate unicode characters in the result.
     # These are the Python2 rules; Python3 differs.
-    consumed, length = rutf8.str_check_utf8(
-        string, len(string), "strict", final=True,
-        errorhandler=decode_error_handler(space),
-        allow_surrogates=True)
+    try:
+        consumed, length = rutf8.str_check_utf8(string, len(string), True)
+    except rutf8.Utf8CheckError as e:
+        decode_error_handler(space)('strict', 'utf8', e.msg, string, e.startpos,
+                                    e.endpos)
+        assert False, "unreachable"
     return length
 
 def encode_utf8(space, uni):
@@ -78,4 +80,69 @@
 def utf8_encode_ascii(utf8, utf8len, errors, errorhandler):
     if len(utf8) == utf8len:
         return utf8
-    xxx
+    return rutf8.utf8_encode_ascii(utf8, errors, 'ascii',
+                                   'ordinal not in range(128)',
+                                   errorhandler)
+
+def str_decode_ascii(s, slen, errors, final, errorhandler):
+    try:
+        rutf8.check_ascii(s)
+        return s, slen, slen
+    except rutf8.AsciiCheckError:
+        return rutf8.str_decode_ascii(s, errors, errorhandler)
+
+# XXX wrappers, think about speed
+
+class DecodeWrapper(object):
+    def __init__(self, handler):
+        self.orig = handler
+
+    def handle(self, errors, encoding, msg, s, pos, endpos):
+        s, p, lgt = self.orig(errors, encoding, msg, s, pos, endpos)
+        return s.decode("utf8"), p
+
+class EncodeWrapper(object):
+    def __init__(self, handler):
+        self.orig = handler
+
+    def handle(self, errors, encoding, msg, s, pos, endpos):
+        s, rs, p, lgt = self.orig(errors, encoding, msg, s.encode("utf8"), pos, endpos)
+        return s, rs, p
+
+def utf8_encode_utf_7(utf8, utf8len, errors, errorhandler):
+    u = utf8.decode("utf8")
+    w = EncodeWrapper(errorhandler)
+    return runicode.unicode_encode_utf_7(u, len(u), errors, w.handle)
+
+def str_decode_utf_7(string, lgt, errors, final, errorhandler):
+    w = DecodeWrapper(errorhandler)
+    u, pos = runicode.str_decode_utf_7(string, lgt, errors, final, w.handle)
+    return u.encode('utf8'), pos, len(u)
+
+def str_decode_unicode_escape(s, slen, errors, final, errorhandler, ud_handler):
+    w = DecodeWrapper(errorhandler)
+    u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final, w.handle,
+                                                ud_handler)
+    return u.encode('utf8'), pos, len(u)
+
+def str_decode_raw_unicode_escape(s, slen, errors, final, errorhandler):
+    w = DecodeWrapper(errorhandler)
+    u, pos = runicode.str_decode_raw_unicode_escape(s, slen, errors, final,
+                                                    w.handle)
+    return u.encode('utf8'), pos, len(u)
+
+def str_decode_utf8(s, slen, errors, final, errorhandler):
+    w = DecodeWrapper(errorhandler)
+    u, pos = runicode.str_decode_utf_8_impl(s, slen, errors, final, w.handle,
+        runicode.allow_surrogate_by_default)
+    return u.encode('utf8'), pos, len(u)
+
+def utf8_encode_utf_16(utf8, utf8len, errors, errorhandler):
+    w = EncodeWrapper(errorhandler)
+    u = utf8.decode("utf8")
+    return runicode.unicode_encode_utf_16(u, len(u), errors, w.handle)
+
+def utf8_encode_latin_1(utf8, utf8len, errors, errorhandler):
+    w = EncodeWrapper(errorhandler)
+    u = utf8.decode("utf8")
+    return runicode.unicode_encode_latin_1(u, len(u), errors, w.handle)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,4 +1,4 @@
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.rlib.rstring import UnicodeBuilder
 from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
@@ -39,7 +39,7 @@
                 w_input = space.newbytes(input)
             else:
                 w_cls = space.w_UnicodeEncodeError
-                w_input = space.newunicode(input)
+                w_input = space.newutf8(input, -1)
             w_exc =  space.call_function(
                 w_cls,
                 space.newtext(encoding),
@@ -65,8 +65,8 @@
                 raise oefmt(space.w_IndexError,
                             "position %d from error handler out of bounds",
                             newpos)
-            replace = space.unicode_w(w_replace)
-            return replace, newpos
+            w_replace = space.convert_to_w_unicode(w_replace)
+            return w_replace._utf8, newpos, w_replace._length
         return call_errorhandler
 
     def make_decode_errorhandler(self, space):
@@ -76,9 +76,9 @@
         errorhandler = self._make_errorhandler(space, False)
         def encode_call_errorhandler(errors, encoding, reason, input, startpos,
                                      endpos):
-            replace, newpos = errorhandler(errors, encoding, reason, input,
+            replace, newpos, lgt = errorhandler(errors, encoding, reason, input,
                                            startpos, endpos)
-            return replace, None, newpos
+            return replace, None, newpos, lgt
         return encode_call_errorhandler
 
     def get_unicodedata_handler(self, space):
@@ -190,7 +190,9 @@
 def ignore_errors(space, w_exc):
     check_exception(space, w_exc)
     w_end = space.getattr(w_exc, space.newtext('end'))
-    return space.newtuple([space.newunicode(u''), w_end])
+    return space.newtuple([space.newutf8('', 0), w_end])
+
+REPLACEMENT = u'\ufffd'.encode('utf8')
 
 def replace_errors(space, w_exc):
     check_exception(space, w_exc)
@@ -198,14 +200,14 @@
     w_end = space.getattr(w_exc, space.newtext('end'))
     size = space.int_w(w_end) - space.int_w(w_start)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        text = u'?' * size
-        return space.newtuple([space.newunicode(text), w_end])
+        text = '?' * size
+        return space.newtuple([space.newutf8(text, size), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
-        text = u'\ufffd'
-        return space.newtuple([space.newunicode(text), w_end])
+        text = REPLACEMENT
+        return space.newtuple([space.newutf8(text, 1), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
-        text = u'\ufffd' * size
-        return space.newtuple([space.newunicode(text), w_end])
+        text = REPLACEMENT * size
+        return space.newtuple([space.newutf8(text, size), w_end])
     else:
         raise oefmt(space.w_TypeError,
                     "don't know how to handle %T in error callback", w_exc)
@@ -392,14 +394,17 @@
     @unwrap_spec(string='bufferstr', errors='str_or_None',
                  w_final=WrappedDefault(False))
     def wrap_decoder(space, string, errors="strict", w_final=None):
+        from pypy.interpreter import unicodehelper
+
         if errors is None:
             errors = 'strict'
         final = space.is_true(w_final)
         state = space.fromcache(CodecState)
-        func = getattr(runicode, rname)
-        result, consumed = func(string, len(string), errors,
+        func = getattr(unicodehelper, rname)
+        result, consumed, length = func(string, len(string), errors,
                                 final, state.decode_error_handler)
-        return space.newtuple([space.newunicode(result), space.newint(consumed)])
+        return space.newtuple([space.newutf8(result, length),
+                               space.newint(consumed)])
     wrap_decoder.func_name = rname
     globals()[name] = wrap_decoder
 
@@ -441,33 +446,42 @@
 # "allow_surrogates=True"
 @unwrap_spec(utf8='utf8', errors='str_or_None')
 def utf_8_encode(space, utf8, utf8len, errors="strict"):
-    if errors is None:
-        errors = 'strict'
-    xxx
-    state = space.fromcache(CodecState)
-    # NB. can't call unicode_encode_utf_8() directly because that's
-    # an @elidable function nowadays.  Instead, we need the _impl().
-    # (The problem is the errorhandler, which calls arbitrary Python.)
-    result = runicode.unicode_encode_utf_8_impl(
-        uni, len(uni), errors, state.encode_error_handler,
-        allow_surrogates=True)
-    return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+    return space.newtuple([space.newbytes(utf8), space.newint(utf8len)])
 
 @unwrap_spec(string='bufferstr', errors='str_or_None',
              w_final = WrappedDefault(False))
 def utf_8_decode(space, string, errors="strict", w_final=None):
+    from pypy.interpreter import unicodehelper
+
     if errors is None:
         errors = 'strict'
     final = space.is_true(w_final)
     state = space.fromcache(CodecState)
-    # NB. can't call str_decode_utf_8() directly because that's
-    # an @elidable function nowadays.  Instead, we need the _impl().
-    # (The problem is the errorhandler, which calls arbitrary Python.)
-    result, consumed = runicode.str_decode_utf_8_impl(
-        string, len(string), errors,
-        final, state.decode_error_handler,
-        allow_surrogates=True)
-    return space.newtuple([space.newunicode(result), space.newint(consumed)])
+    # call the fast version for checking
+    try:
+        consumed, lgt = rutf8.str_check_utf8(string, len(string), final)
+    except rutf8.Utf8CheckError as e:
+        if errors == 'strict':
+            # just raise
+            state.decode_error_handler(errors, 'utf8', e.msg, string,
+                                       e.startpos, e.endpos)
+            assert False, "raises"
+        # XXX go the long way around via runicode - we can optimize it later
+        # if we decide we care about obscure cases
+        res, consumed, lgt = unicodehelper.str_decode_utf8(string, len(string),
+            errors, final, state.decode_error_handler)
+        return space.newtuple([space.newutf8(res, lgt),
+                           space.newint(consumed)])
+    #result, consumed = runicode.str_decode_utf_8_impl(
+    #    string, len(string), errors,
+    #    final, state.decode_error_handler,
+    #    allow_surrogates=True)
+    if final or consumed == len(string):
+        return space.newtuple([space.newutf8(string, lgt),
+                               space.newint(consumed)])
+
+    return space.newtuple([space.newutf8(string[:consumed], lgt),
+                           space.newint(consumed)])
 
 @unwrap_spec(data='bufferstr', errors='str_or_None', byteorder=int,
              w_final=WrappedDefault(False))
@@ -655,6 +669,8 @@
 @unwrap_spec(string='bufferstr', errors='str_or_None',
              w_final=WrappedDefault(False))
 def unicode_escape_decode(space, string, errors="strict", w_final=None):
+    from pypy.interpreter import unicodehelper
+
     if errors is None:
         errors = 'strict'
     final = space.is_true(w_final)
@@ -662,12 +678,12 @@
 
     unicode_name_handler = state.get_unicodedata_handler(space)
 
-    result, consumed = runicode.str_decode_unicode_escape(
+    result, consumed, lgt = unicodehelper.str_decode_unicode_escape(
         string, len(string), errors,
         final, state.decode_error_handler,
         unicode_name_handler)
 
-    return space.newtuple([space.newunicode(result), space.newint(consumed)])
+    return space.newtuple([space.newutf8(result, lgt), space.newint(consumed)])
 
 # ____________________________________________________________
 # Unicode-internal
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -719,7 +719,7 @@
     def descr_init(self, space, w_encoding, w_object, w_start, w_end, w_reason):
         # typechecking
         space.realstr_w(w_encoding)
-        space.realunicode_w(w_object)
+        space.realutf8_w(w_object)
         space.int_w(w_start)
         space.int_w(w_end)
         space.realstr_w(w_reason)
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -460,6 +460,11 @@
     def str_w(self, space):
         return self._value
 
+    def utf8_w(self, space):
+        # Use the default encoding.
+        encoding = getdefaultencoding(space)
+        return space.utf8_w(decode_object(space, self, encoding, None))
+
     def buffer_w(self, space, flags):
         space.check_buf_flags(flags, True)
         return StringBuffer(self._value)
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -650,9 +650,9 @@
         if type(w_key) is self.space.StringObjectCls:
             self.switch_to_bytes_strategy(w_dict)
             return
-        elif type(w_key) is self.space.UnicodeObjectCls:
-            self.switch_to_unicode_strategy(w_dict)
-            return
+        #elif type(w_key) is self.space.UnicodeObjectCls:
+        #    self.switch_to_unicode_strategy(w_dict)
+        #    return
         w_type = self.space.type(w_key)
         if self.space.is_w(w_type, self.space.w_int):
             self.switch_to_int_strategy(w_dict)
@@ -668,6 +668,7 @@
         w_dict.dstorage = storage
 
     def switch_to_unicode_strategy(self, w_dict):
+        xxx
         strategy = self.space.fromcache(UnicodeDictStrategy)
         storage = strategy.get_empty_storage()
         w_dict.set_strategy(strategy)
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -511,7 +511,8 @@
             pass
         else:
             return space.newbytes(result)
-    fmt = space.unicode_w(w_fmt)
+    # XXX for now, this is performance critical
+    fmt = space.utf8_w(w_fmt).decode("utf8")
     formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
     result = formatter.format()
     return space.newunicode(result)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -491,8 +491,8 @@
             return w_obj.listview_unicode()
         if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
             return w_obj.listview_unicode()
-        if isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj):
-            return w_obj.listview_unicode()
+        #if isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj):
+        #    return w_obj.listview_unicode()
         if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj):
             return w_obj.getitems_unicode()
         return None
diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py
--- a/pypy/objspace/std/setobject.py
+++ b/pypy/objspace/std/setobject.py
@@ -799,8 +799,8 @@
             strategy = self.space.fromcache(IntegerSetStrategy)
         elif type(w_key) is W_BytesObject:
             strategy = self.space.fromcache(BytesSetStrategy)
-        elif type(w_key) is W_UnicodeObject:
-            strategy = self.space.fromcache(UnicodeSetStrategy)
+        #elif type(w_key) is W_UnicodeObject:
+        #    strategy = self.space.fromcache(UnicodeSetStrategy)
         elif self.space.type(w_key).compares_by_identity():
             strategy = self.space.fromcache(IdentitySetStrategy)
         else:
@@ -1640,13 +1640,13 @@
         return
 
     # check for unicode
-    for w_item in iterable_w:
-        if type(w_item) is not W_UnicodeObject:
-            break
-    else:
-        w_set.strategy = space.fromcache(UnicodeSetStrategy)
-        w_set.sstorage = w_set.strategy.get_storage_from_list(iterable_w)
-        return
+    #for w_item in iterable_w:
+    #    if type(w_item) is not W_UnicodeObject:
+    #        break
+    #else:
+    #    w_set.strategy = space.fromcache(UnicodeSetStrategy)
+    #    w_set.sstorage = w_set.strategy.get_storage_from_list(iterable_w)
+    #    return
 
     # check for compares by identity
     for w_item in iterable_w:
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -142,6 +142,7 @@
         assert self.space.listview_bytes(w_d) == ["a", "b"]
 
     def test_listview_unicode_dict(self):
+        py.test.skip("listview_unicode disabled")
         w = self.space.wrap
         w_d = self.space.newdict()
         w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(2))])
@@ -175,6 +176,7 @@
         # XXX: it would be nice if the test passed without monkeypatch.undo(),
         # but we need space.newlist_unicode for it
         monkeypatch.undo() 
+        py.test.skip("listview_unicode disabled")
         w_d = self.space.newdict()
         w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(6))])
         w_l = self.space.call_method(w_d, "keys")
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -20,8 +20,8 @@
                           IntegerListStrategy)
         assert isinstance(W_ListObject(space, [wb('a'), wb('b')]).strategy,
                           BytesListStrategy)
-        assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
-                          UnicodeListStrategy)
+        #assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
+        #                  UnicodeListStrategy)
         assert isinstance(W_ListObject(space, [w(u'a'), wb('b')]).strategy,
                           ObjectListStrategy) # mixed unicode and bytes
 
@@ -47,7 +47,7 @@
         l = W_ListObject(space, [])
         assert isinstance(l.strategy, EmptyListStrategy)
         l.append(w(u'a'))
-        assert isinstance(l.strategy, UnicodeListStrategy)
+        #assert isinstance(l.strategy, UnicodeListStrategy)
 
         l = W_ListObject(space, [])
         assert isinstance(l.strategy, EmptyListStrategy)
@@ -74,6 +74,7 @@
         assert isinstance(l.strategy, ObjectListStrategy)
 
     def test_unicode_to_any(self):
+        py.test.skip("disabled")
         space = self.space
         l = W_ListObject(space, [space.wrap(u'a'), space.wrap(u'b'), space.wrap(u'c')])
         assert isinstance(l.strategy, UnicodeListStrategy)
@@ -117,7 +118,7 @@
 
         # UnicodeStrategy to ObjectStrategy
         l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
-        assert isinstance(l.strategy, UnicodeListStrategy)
+        #assert isinstance(l.strategy, UnicodeListStrategy)
         l.setitem(0, w(2))
         assert isinstance(l.strategy, ObjectListStrategy)
 
@@ -145,7 +146,7 @@
 
         # UnicodeStrategy
         l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
-        assert isinstance(l.strategy, UnicodeListStrategy)
+        #assert isinstance(l.strategy, UnicodeListStrategy)
         l.insert(3, w(2))
         assert isinstance(l.strategy, ObjectListStrategy)
 
@@ -225,7 +226,7 @@
 
         # UnicodeStrategy to ObjectStrategy
         l = W_ListObject(space, [w(u'a'), w(u'b'), w(u'c')])
-        assert isinstance(l.strategy, UnicodeListStrategy)
+        #assert isinstance(l.strategy, UnicodeListStrategy)
         l.setslice(0, 1, 2, W_ListObject(space, [w(1), w(2), w(3)]))
         assert isinstance(l.strategy, ObjectListStrategy)
 
@@ -275,7 +276,7 @@
         l = W_ListObject(space, wrapitems([u"a",u"b",u"c",u"d",u"e"]))
         other = W_ListObject(space, wrapitems([u"a", u"b", u"c"]))
         keep_other_strategy(l, 0, 2, other.length(), other)
-        assert l.strategy is space.fromcache(UnicodeListStrategy)
+        #assert l.strategy is space.fromcache(UnicodeListStrategy)
 
         l = W_ListObject(space, wrapitems([1.1, 2.2, 3.3, 4.4, 5.5]))
         other = W_ListObject(space, [])
@@ -345,7 +346,7 @@
         empty = W_ListObject(space, [])
         assert isinstance(empty.strategy, EmptyListStrategy)
         empty.extend(W_ListObject(space, [w(u"a"), w(u"b"), w(u"c")]))
-        assert isinstance(empty.strategy, UnicodeListStrategy)
+        #assert isinstance(empty.strategy, UnicodeListStrategy)
 
         empty = W_ListObject(space, [])
         assert isinstance(empty.strategy, EmptyListStrategy)
@@ -601,7 +602,7 @@
         l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")])
         assert isinstance(l1.strategy, BytesListStrategy)
         l2 = W_ListObject(self.space, [self.space.newunicode(u"eins"), self.space.newunicode(u"zwei")])
-        assert isinstance(l2.strategy, UnicodeListStrategy)
+        #assert isinstance(l2.strategy, UnicodeListStrategy)
         l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newunicode(u"zwei")])
         assert isinstance(l3.strategy, ObjectListStrategy)
 
@@ -612,6 +613,7 @@
         assert space.listview_bytes(w_l) == ["a", "b"]
 
     def test_listview_unicode(self):
+        py.test.skip("disabled")
         space = self.space
         assert space.listview_unicode(space.wrap(1)) == None
         w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
@@ -624,6 +626,7 @@
         assert space.str_w(space.call_method(space.wrap("c"), "join", w_l)) == "acb"
         #
         # the same for unicode
+        py.test.skip("disabled")
         w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
         w_l.getitems = None
         assert space.unicode_w(space.call_method(space.wrap(u"c"), "join", w_l)) == u"acb"
@@ -636,6 +639,7 @@
         assert space.is_w(space.call_method(space.wrap(" -- "), "join", w_l), w_text)
         #
         # the same for unicode
+        py.test.skip("disabled")
         w_text = space.wrap(u"text")
         w_l = self.space.newlist([w_text])
         w_l.getitems = None
@@ -665,6 +669,7 @@
         assert space.listview_bytes(w_l4) == ["a", "b", "c"]
 
     def test_unicode_uses_newlist_unicode(self):
+        py.test.skip("disabled")
         space = self.space
         w_u = space.wrap(u"a b c")
         space.newlist = None
@@ -720,6 +725,7 @@
         assert self.space.listview_bytes(w_l) == ["a", "b"]
 
     def test_listview_unicode_list(self):
+        py.test.skip("disabled")
         space = self.space
         w_l = W_ListObject(space, [space.wrap(u"a"), space.wrap(u"b")])
         assert self.space.listview_unicode(w_l) == [u"a", u"b"]
diff --git a/pypy/objspace/std/test/test_obj.py b/pypy/objspace/std/test/test_obj.py
--- a/pypy/objspace/std/test/test_obj.py
+++ b/pypy/objspace/std/test/test_obj.py
@@ -17,7 +17,7 @@
         cls.w_cpython_apptest = space.wrap(option.runappdirect and not hasattr(sys, 'pypy_translation_info'))
 
         def w_unwrap_wrap_unicode(space, w_obj):
-            return space.wrap(space.unicode_w(w_obj))
+            return space.newutf8(space.utf8_w(w_obj), w_obj._length)
         cls.w_unwrap_wrap_unicode = space.wrap(gateway.interp2app(w_unwrap_wrap_unicode))
         def w_unwrap_wrap_str(space, w_obj):
             return space.wrap(space.str_w(w_obj))
@@ -194,7 +194,8 @@
         assert id('') == (256 << 4) | 11     # always
         assert id(u'') == (257 << 4) | 11
         assert id('a') == (ord('a') << 4) | 11
-        assert id(u'\u1234') == ((~0x1234) << 4) | 11
+        # we no longer cache unicodes <128
+        # assert id(u'\u1234') == ((~0x1234) << 4) | 11
 
     def test_id_of_tuples(self):
         l = []
diff --git a/pypy/objspace/std/test/test_setobject.py b/pypy/objspace/std/test/test_setobject.py
--- a/pypy/objspace/std/test/test_setobject.py
+++ b/pypy/objspace/std/test/test_setobject.py
@@ -105,8 +105,8 @@
         w_list = self.space.iter(W_ListObject(self.space, [w(u"1"), w(u"2"), w(u"3")]))
         w_set = W_SetObject(self.space)
         _initialize_set(self.space, w_set, w_list)
-        assert w_set.strategy is self.space.fromcache(UnicodeSetStrategy)
-        assert w_set.strategy.unerase(w_set.sstorage) == {u"1":None, u"2":None, u"3":None}
+        #assert w_set.strategy is self.space.fromcache(UnicodeSetStrategy)
+        #assert w_set.strategy.unerase(w_set.sstorage) == {u"1":None, u"2":None, u"3":None}
 
         w_list = W_ListObject(self.space, [w("1"), w(2), w("3")])
         w_set = W_SetObject(self.space)
diff --git a/pypy/objspace/std/test/test_setstrategies.py b/pypy/objspace/std/test/test_setstrategies.py
--- a/pypy/objspace/std/test/test_setstrategies.py
+++ b/pypy/objspace/std/test/test_setstrategies.py
@@ -1,3 +1,5 @@
+
+import py
 from pypy.objspace.std.setobject import W_SetObject
 from pypy.objspace.std.setobject import (
     BytesIteratorImplementation, BytesSetStrategy, EmptySetStrategy,
@@ -26,8 +28,8 @@
         s = W_SetObject(self.space, self.wrapped(["a", "b"]))
         assert s.strategy is self.space.fromcache(BytesSetStrategy)
 
-        s = W_SetObject(self.space, self.wrapped([u"a", u"b"]))
-        assert s.strategy is self.space.fromcache(UnicodeSetStrategy)
+        #s = W_SetObject(self.space, self.wrapped([u"a", u"b"]))
+        #assert s.strategy is self.space.fromcache(UnicodeSetStrategy)
 
     def test_switch_to_object(self):
         s = W_SetObject(self.space, self.wrapped([1,2,3,4,5]))
@@ -40,6 +42,7 @@
         assert s1.strategy is self.space.fromcache(ObjectSetStrategy)
 
     def test_switch_to_unicode(self):
+        py.test.skip("disabled")
         s = W_SetObject(self.space, self.wrapped([]))
         s.add(self.space.wrap(u"six"))
         assert s.strategy is self.space.fromcache(UnicodeSetStrategy)
@@ -128,11 +131,11 @@
         assert space.unwrap(it.next()) == "a"
         assert space.unwrap(it.next()) == "b"
         #
-        s = W_SetObject(space, self.wrapped([u"a", u"b"]))
-        it = s.iter()
-        assert isinstance(it, UnicodeIteratorImplementation)
-        assert space.unwrap(it.next()) == u"a"
-        assert space.unwrap(it.next()) == u"b"
+        #s = W_SetObject(space, self.wrapped([u"a", u"b"]))
+        #it = s.iter()
+        #assert isinstance(it, UnicodeIteratorImplementation)
+        #assert space.unwrap(it.next()) == u"a"
+        #assert space.unwrap(it.next()) == u"b"
 
     def test_listview(self):
         space = self.space
@@ -142,5 +145,5 @@
         s = W_SetObject(space, self.wrapped(["a", "b"]))
         assert sorted(space.listview_bytes(s)) == ["a", "b"]
         #
-        s = W_SetObject(space, self.wrapped([u"a", u"b"]))
-        assert sorted(space.listview_unicode(s)) == [u"a", u"b"]
+        #s = W_SetObject(space, self.wrapped([u"a", u"b"]))
+        #assert sorted(space.listview_unicode(s)) == [u"a", u"b"]
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -518,7 +518,7 @@
         raises(TypeError, u'hello'.translate)
         raises(TypeError, u'abababc'.translate, {ord('a'):''})
 
-    def test_unicode_form_encoded_object(self):
+    def test_unicode_from_encoded_object(self):
         assert unicode('x', 'utf-8') == u'x'
         assert unicode('x', 'utf-8', 'strict') == u'x'
 
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -70,12 +70,14 @@
     def immutable_unique_id(self, space):
         if self.user_overridden_class:
             return None
-        s = space.unicode_w(self)
-        if len(s) > 1:
+        s = space.utf8_w(self)
+        if len(s) > 2:
             uid = compute_unique_id(s)
         else:            # strings of len <= 1 are unique-ified
             if len(s) == 1:
                 base = ~ord(s[0])      # negative base values
+            elif len(s) == 2:
+                base = ~((ord(s[1]) << 8) | ord(s[0]))
             else:
                 base = 257       # empty unicode string: base value 257
             uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL
@@ -88,9 +90,11 @@
         return self._utf8
 
     def readbuf_w(self, space):
+        # XXX for now
         from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
-        builder = StringBuilder(len(self._value) * UNICODE_SIZE)
-        for unich in self._value:
+        v = self._utf8.decode("utf8")
+        builder = StringBuilder(len(v) * UNICODE_SIZE)
+        for unich in v:
             pack_unichar(unich, builder)
         return StringBuffer(builder.build())
 
@@ -331,7 +335,8 @@
         formatter = newformat.unicode_formatter(space, spec)
         self2 = unicode_from_object(space, self)
         assert isinstance(self2, W_UnicodeObject)
-        return formatter.format_string(self2._value)
+        # XXX
+        return formatter.format_string(self2._utf8.decode("utf8"))
 
     def descr_mod(self, space, w_values):
         return mod_format(space, self, w_values, do_unicode=True)
@@ -617,21 +622,28 @@
         if errors is None or errors == 'strict':
             try:
                 if encoding == 'ascii':
-                    u = space.unicode_w(w_object)
-                    eh = unicodehelper.raise_unicode_exception_encode
-                    return space.newbytes(unicode_encode_ascii(
-                            u, len(u), None, errorhandler=eh))
+                    s = space.utf8_w(w_object)
+                    try:
+                        rutf8.check_ascii(s)
+                    except rutf8.AsciiCheckError as a:
+                        eh = unicodehelper.raise_unicode_exception_encode
+                        eh(None, "ascii", "ordinal not in range(128)", s,
+                            a.pos, a.pos + 1)
+                        assert False, "always raises"
+                    return space.newbytes(s)
                 if encoding == 'utf-8':
-                    u = space.unicode_w(w_object)
-                    eh = unicodehelper.raise_unicode_exception_encode
-                    return space.newbytes(unicode_encode_utf_8(
-                            u, len(u), None, errorhandler=eh,
-                            allow_surrogates=True))
+                    u = space.utf8_w(w_object)
+                    return space.newbytes(u)
+                    # XXX is this enough?
+                    #eh = unicodehelper.raise_unicode_exception_encode
+                    #return space.newbytes(unicode_encode_utf_8(
+                    #        u, len(u), None, errorhandler=eh,
+                    #        allow_surrogates=True))
             except unicodehelper.RUnicodeEncodeError as ue:
                 raise OperationError(space.w_UnicodeEncodeError,
                                      space.newtuple([
                     space.newtext(ue.encoding),
-                    space.newunicode(ue.object),
+                    space.newutf8(ue.object, -1),
                     space.newint(ue.start),
                     space.newint(ue.end),
                     space.newtext(ue.reason)]))
@@ -665,13 +677,15 @@
                 assert False
             return space.newutf8(s, len(s))
         if encoding == 'utf-8':
-            yyy
             s = space.charbuf_w(w_obj)
             eh = unicodehelper.decode_error_handler(space)
-            return space.newunicode(str_decode_utf_8(
-                    s, len(s), None, final=True, errorhandler=eh,
-                    allow_surrogates=True)[0])
-    xxx
+            try:
+                _, lgt = rutf8.str_check_utf8(s, len(s), final=True,
+                                              allow_surrogates=True)
+            except rutf8.Utf8CheckError as e:
+                eh(None, 'utf8', e.msg, s, e.startpos, e.endpos)
+                assert False, "has to raise"
+            return space.newutf8(s, lgt)
     w_codecs = space.getbuiltinmodule("_codecs")
     w_decode = space.getattr(w_codecs, space.newtext("decode"))
     if errors is None:
@@ -723,7 +737,6 @@
     # this is a performance and bootstrapping hack
     encoding = getdefaultencoding(space)
     if encoding != 'ascii':
-        xxx
         return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
     s = space.bytes_w(w_bytes)
     try:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -1,9 +1,6 @@
 
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import runicode, jit
-from rpython.rlib.rarithmetic import r_uint
-from rpython.rlib.nonconst import NonConstant
-from rpython.tool.sourcetools import func_with_new_name
 
 def unichr_as_utf8(code):
     """ Encode code (numeric value) as utf8 encoded string
@@ -81,6 +78,38 @@
 def default_unicode_error_check(*args):
     xxx
 
+def utf8_encode_ascii(s, errors, encoding, msg, errorhandler):
+    res = StringBuilder(len(s))
+    u_pos = 0
+    pos = 0
+    while pos < len(s):
+        chr1 = s[pos]
+        if ord(chr1) < 0x80:
+            res.append(chr1)
+        else:
+            repl, _, _, _ = errorhandler(errors, encoding, msg, s, u_pos, u_pos + 1)
+            res.append(repl)
+        u_pos += 1
+        pos = next_codepoint_pos(s, pos)
+    return res.build()
+
+def str_decode_ascii(s, errors, errorhandler):
+    # ASCII is equivalent to the first 128 ordinals in Unicode.
+    size = len(s)
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        c = s[pos]
+        if ord(c) < 128:
+            result.append(c)
+        else:
+            r, _, _ = errorhandler(errors, "ascii", "ordinal not in range(128)",
+                                   s,  pos, pos + 1)
+            result.append(r)
+        pos += 1
+    return result.build(), pos, -1
+
+
 def default_unicode_error_decode(errors, encoding, message, s, pos, endpos, lgt):
     if errors == 'replace':
         return '\xef\xbf\xbd', endpos, lgt + 1 # u'\ufffd'
@@ -105,30 +134,28 @@
         return chr3 == 0xa8 or chr3 == 0xa9
     return False
 
-# if you can't use the @elidable version, call str_check_utf8_impl()
-# directly
+class Utf8CheckError(Exception):
+    def __init__(self, msg, startpos, endpos):
+        self.msg = msg
+        self.startpos = startpos
+        self.endpos = endpos
+
 @jit.elidable
-def str_check_utf8(s, size, errors, final=False,
-                   errorhandler=None,
+def str_check_utf8(s, size, final=False,
                    allow_surrogates=runicode.allow_surrogate_by_default):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_check
-    # XXX unclear, fix
+    """ A simplified version of utf8 decoder - it only works with 'strict'
+    error handling.
+    """
+    # XXX do the following in a cleaner way, e.g. via signature
     # NB. a bit messy because rtyper/rstr.py also calls the same
     # function.  Make sure we annotate for the args it passes, too
-    if NonConstant(False):
-        s = NonConstant('?????')
-        size = NonConstant(12345)
-        errors = NonConstant('strict')
-        final = NonConstant(True)
-        WTF # goes here
-        errorhandler = ll_unicode_error_decode
-        allow_surrogates = NonConstant(True)
-    return str_check_utf8_elidable(s, size, errors, final, errorhandler,
-                                   allow_surrogates=allow_surrogates)
-
-def str_check_utf8_impl(s, size, errors, final, errorhandler,
-                        allow_surrogates):
+    #if NonConstant(False):
+    #    s = NonConstant('?????')
+    #    size = NonConstant(12345)
+    #    errors = NonConstant('strict')
+    #    final = NonConstant(True)
+    #    errorhandler = ll_unicode_error_decode
+    #    allow_surrogates = NonConstant(True)
     if size == 0:
         return 0, 0
 
@@ -155,46 +182,43 @@
             # in case we need to continue running this loop
             if not charsleft:
                 # there's only the start byte and nothing else
-                errorhandler(errors, 'utf8', 'unexpected end of data',
-                             s, pos, pos+1)
+                raise Utf8CheckError('unexpected end of data', pos, pos + 1)
             ordch2 = ord(s[pos+1])
             if n == 3:
                 # 3-bytes seq with only a continuation byte
                 if runicode._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                     # second byte invalid, take the first and continue
-                    errorhandler(errors, 'utf8', 'invalid continuation byte',
-                                 s, pos, pos+1)
+                    raise Utf8CheckError('invalid continuation byte', pos,
+                                         pos + 1)
                 else:
                     # second byte valid, but third byte missing
-                    errorhandler(errors, 'utf8', 'unexpected end of data',
-                                 s, pos, pos+2)
+                    raise Utf8CheckError('unexpected end of data', pos, pos + 2)
             elif n == 4:
                 # 4-bytes seq with 1 or 2 continuation bytes
                 if runicode._invalid_byte_2_of_4(ordch1, ordch2):
                     # second byte invalid, take the first and continue
-                    errorhandler(errors, 'utf8', 'invalid continuation byte',
-                                 s, pos, pos+1)
+                    raise Utf8CheckError('invalid continuation byte', pos,
+                                         pos + 1)
                 elif charsleft == 2 and runicode._invalid_byte_3_of_4(ord(s[pos+2])):
                     # third byte invalid, take the first two and continue
-                    errorhandler(errors, 'utf8', 'invalid continuation byte',
-                                 s, pos, pos+2)
+                    raise Utf8CheckError('invalid continuation byte', pos,
+                                         pos + 2)
                 else:
                     # there's only 1 or 2 valid cb, but the others are missing
-                    errorhandler(errors, 'utf8', 'unexpected end of data',
-                                 s, pos, pos+charsleft+1)
+                    raise Utf8CheckError('unexpected end of data', pos,
+                                         pos + charsleft + 1)
             raise AssertionError("unreachable")
 
         if n == 0:
-            errorhandler(errors, 'utf8', 'invalid start byte', s, pos, pos+1)
+            raise Utf8CheckError('invalid start byte', pos, pos + 1)
         elif n == 1:
             assert 0, "ascii should have gone through the fast path"
 
         elif n == 2:
             ordch2 = ord(s[pos+1])
             if runicode._invalid_byte_2_of_2(ordch2):
-                errorhandler(errors, 'utf8', 'invalid continuation byte',
-                             s, pos, pos+2)
-                assert False, "unreachable"
+                raise Utf8CheckError('invalid continuation byte', pos,
+                                     pos + 2)
             # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
             lgt += 1
             pos += 2
@@ -203,13 +227,11 @@
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
             if runicode._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
-                errorhandler(errors, 'utf8', 'invalid continuation byte',
-                             s, pos, pos+1)
-                assert False, "unreachable"
+                raise Utf8CheckError('invalid continuation byte', pos,
+                                     pos + 1)
             elif runicode._invalid_byte_3_of_3(ordch3):
-                errorhandler(errors, 'utf8', 'invalid continuation byte',
-                             s, pos, pos+2)
-                assert False, "unreachable"
+                raise Utf8CheckError('invalid continuation byte', pos,
+                                     pos + 2)
             # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
             lgt += 1
             pos += 3
@@ -219,17 +241,14 @@
             ordch3 = ord(s[pos+2])
             ordch4 = ord(s[pos+3])
             if runicode._invalid_byte_2_of_4(ordch1, ordch2):
-                errorhandler(errors, 'utf8', 'invalid continuation byte',
-                             s, pos, pos+1)
-                assert False, "unreachable"
+                raise Utf8CheckError('invalid continuation byte', pos,
+                                     pos + 1)
             elif runicode._invalid_byte_3_of_4(ordch3):
-                errorhandler(errors, 'utf8', 'invalid continuation byte',
-                             s, pos, pos+2)
-                assert False, "unreachable"
+                raise Utf8CheckError('invalid continuation byte', pos,
+                                     pos + 2)
             elif runicode._invalid_byte_4_of_4(ordch4):
-                errorhandler(errors, 'utf8', 'invalid continuation byte',
-                             s, pos, pos+3)
-                assert False, "unreachable"
+                raise Utf8CheckError('invalid continuation byte', pos,
+                                     pos + 3)
             # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
             c = (((ordch1 & 0x07) << 18) +      # 0b00000111
                  ((ordch2 & 0x3F) << 12) +      # 0b00111111
@@ -243,5 +262,3 @@
             pos += 4
 
     return pos, lgt
-str_check_utf8_elidable = jit.elidable(
-    func_with_new_name(str_check_utf8_impl, "str_check_utf8_elidable"))


More information about the pypy-commit mailing list