[pypy-commit] pypy unicode-utf8: (fijal, argio) whack until we run into a serious problem

Mon Feb 27 06:12:22 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90376:85fee86ba1f7
Date: 2017-02-27 12:11 +0100
http://bitbucket.org/pypy/pypy/changeset/85fee86ba1f7/

Log:	(fijal, argio) whack until we run into a serious problem

diff --git a/pypy/module/__pypy__/__init__.py b/pypy/module/__pypy__/__init__.py
--- a/pypy/module/__pypy__/__init__.py
+++ b/pypy/module/__pypy__/__init__.py
@@ -10,7 +10,7 @@
 
     interpleveldefs = {
         "StringBuilder": "interp_builders.W_StringBuilder",
-        "UnicodeBuilder": "interp_builders.W_UnicodeBuilder",
+        #"UnicodeBuilder": "interp_builders.W_UnicodeBuilder",
     }
 
 class TimeModule(MixedModule):
diff --git a/pypy/module/__pypy__/interp_builders.py b/pypy/module/__pypy__/interp_builders.py
--- a/pypy/module/__pypy__/interp_builders.py
+++ b/pypy/module/__pypy__/interp_builders.py
@@ -64,4 +64,4 @@
     return W_Builder
 
 W_StringBuilder = create_builder("StringBuilder", str, StringBuilder, "newbytes")
-W_UnicodeBuilder = create_builder("UnicodeBuilder", unicode, UnicodeBuilder, "newunicode")
+#W_UnicodeBuilder = create_builder("UnicodeBuilder", unicode, UnicodeBuilder, "newunicode")
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -486,6 +486,7 @@
 @unwrap_spec(data='bufferstr', errors='str_or_None', byteorder=int,
              w_final=WrappedDefault(False))
 def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=None):
+    assert False, "fix in the future"
     if errors is None:
         errors = 'strict'
     final = space.is_true(w_final)
@@ -507,6 +508,7 @@
 @unwrap_spec(data='bufferstr', errors='str_or_None', byteorder=int,
              w_final=WrappedDefault(False))
 def utf_32_ex_decode(space, data, errors='strict', byteorder=0, w_final=None):
+    assert False, "fix in the future"
     final = space.is_true(w_final)
     state = space.fromcache(CodecState)
     if byteorder == 0:
diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py
--- a/pypy/objspace/fake/objspace.py
+++ b/pypy/objspace/fake/objspace.py
@@ -47,8 +47,8 @@
         return NonConstant("foobar")
     identifier_w = bytes_w = str_w
 
-    def unicode_w(self, space):
-        return NonConstant(u"foobar")
+    def utf8_w(self, space):
+        return NonConstant("foobar")
 
     def int_w(self, space, allow_conversion=True):
         return NonConstant(-42)
@@ -208,7 +208,7 @@
     def newbytes(self, x):
         return w_some_obj()
 
-    def newunicode(self, x):
+    def newutf8(self, x, l):
         return w_some_obj()
 
     newtext = newbytes
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -17,7 +17,7 @@
 from pypy.objspace.std.stringmethods import StringMethods
 from pypy.objspace.std.unicodeobject import (
     decode_object, unicode_from_encoded_object,
-    getdefaultencoding)
+    getdefaultencoding, unicode_from_string)
 from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
 
 
@@ -53,17 +53,7 @@
         return space.newint(uid)
 
     def convert_to_w_unicode(self, space):
-        # Use the default encoding.
-        encoding = getdefaultencoding(space)
-        if encoding == 'ascii':
-            try:
-                rutf8.check_ascii(self._value)
-                return space.newutf8(self._value, len(self._value))
-            except rutf8.AsciiCheckError:
-                xxx
-        else:
-            xxx
-        return space.unicode_w(decode_object(space, self, encoding, None))
+        return unicode_from_string(space, self)
 
     def descr_add(self, space, w_other):
         """x.__add__(y) <==> x+y"""
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -1032,7 +1032,7 @@
 
         unilist = space.listview_unicode(w_iterable)
         if unilist is not None:
-            xxx
+            assert False, "disabled"
             w_list.strategy = strategy = space.fromcache(UnicodeListStrategy)
             # need to copy because intlist can share with w_iterable
             w_list.lstorage = strategy.erase(unilist[:])
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -203,6 +203,7 @@
         return unichr(unicodedb.totitle(ord(ch)))
 
     def _newlist_unwrapped(self, space, lst):
+        assert False, "should not be called"
         return space.newlist_unicode(lst)
 
     @staticmethod
diff --git a/rpython/annotator/listdef.py b/rpython/annotator/listdef.py
--- a/rpython/annotator/listdef.py
+++ b/rpython/annotator/listdef.py
@@ -107,6 +107,9 @@
             self.bookkeeper.annotator.reflowfromposition(position_key)
 
     def generalize(self, s_other_value):
+        if hasattr(self.s_value, 'can_be_None') and not self.s_value.can_be_None and getattr(s_other_value, 'can_be_None', False):
+            import pdb
+            pdb.set_trace()
         s_new_value = unionof(self.s_value, s_other_value)
         updated = s_new_value != self.s_value
         if updated:
diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py
--- a/rpython/annotator/unaryop.py
+++ b/rpython/annotator/unaryop.py
@@ -671,7 +671,7 @@
         return getbookkeeper().newlist(s_item)
 
     def method_rsplit(self, patt, max=-1):
-        s_item = self.basestringclass(no_nul=self.no_nul)
+        s_item = self.basestringclass(no_nul=self.no_nul, can_be_None=False)
         return getbookkeeper().newlist(s_item)
 
     def method_replace(self, s1, s2):
@@ -696,7 +696,7 @@
         if not s_enc.is_constant():
             raise AnnotatorError("Non-constant encoding not supported")
         enc = s_enc.const
-        if enc not in ('ascii', 'latin-1', 'utf-8'):
+        if enc not in ('ascii', 'latin-1', 'utf-8', 'utf8'):
             raise AnnotatorError("Encoding %s not supported for unicode" % (enc,))
         return SomeString(no_nul=self.no_nul)
     method_encode.can_only_throw = []
@@ -729,7 +729,7 @@
         if not s_enc.is_constant():
             raise AnnotatorError("Non-constant encoding not supported")
         enc = s_enc.const
-        if enc not in ('ascii', 'latin-1', 'utf-8'):
+        if enc not in ('ascii', 'latin-1', 'utf-8', 'utf8'):
             raise AnnotatorError("Encoding %s not supported for strings" % (enc,))
         return SomeUnicodeString(no_nul=self.no_nul)
     method_decode.can_only_throw = [UnicodeDecodeError]
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -346,8 +346,6 @@
     #
     # See also unicode_encode_utf8sp().
     #
-    if errorhandler is None:
-        errorhandler = default_unicode_error_encode
     # NB. a bit messy because rtyper/rstr.py also calls the same
     # function.  Make sure we annotate for the args it passes, too
     if NonConstant(False):
@@ -361,6 +359,9 @@
 
 def unicode_encode_utf_8_impl(s, size, errors, errorhandler,
                               allow_surrogates=False):
+    # XXX hack
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
     assert(size >= 0)
     result = StringBuilder(size)
     pos = 0
diff --git a/rpython/rlib/streamio.py b/rpython/rlib/streamio.py
--- a/rpython/rlib/streamio.py
+++ b/rpython/rlib/streamio.py
@@ -708,7 +708,9 @@
                     assert stop >= 0
                     chunks.append(self.buf[:stop])
                     break
-                chunks.append(self.buf)
+                buf = self.buf
+                assert buf is not None
+                chunks.append(buf)
             return ''.join(chunks)
 
     def readline(self):
diff --git a/rpython/rtyper/rstr.py b/rpython/rtyper/rstr.py
--- a/rpython/rtyper/rstr.py
+++ b/rpython/rtyper/rstr.py
@@ -335,7 +335,7 @@
             return hop.gendirectcall(self.ll.ll_str2unicode, v_self)
         elif encoding == 'latin-1':
             return hop.gendirectcall(self.ll_decode_latin1, v_self)
-        elif encoding == 'utf-8':
+        elif encoding == 'utf-8' or encoding == 'utf8':
             return hop.gendirectcall(self.ll_decode_utf8, v_self)
         else:
             raise TyperError("encoding %s not implemented" % (encoding, ))
@@ -408,7 +408,7 @@
             return hop.gendirectcall(self.ll_str, v_self)
         elif encoding == "latin-1":
             return hop.gendirectcall(self.ll_encode_latin1, v_self)
-        elif encoding == 'utf-8':
+        elif encoding == 'utf-8' or encoding == 'utf8':
             return hop.gendirectcall(self.ll_encode_utf8, v_self)
         else:
             raise TyperError("encoding %s not implemented" % (encoding, ))