[pypy-commit] pypy unicode-utf8: start passing some unicode tests; UCS4 is not completely supported yet
fijal
pypy.commits at gmail.com
Wed Feb 22 12:02:47 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90309:f05bed30187f
Date: 2017-02-22 18:02 +0100
http://bitbucket.org/pypy/pypy/changeset/f05bed30187f/
Log: start passing some unicode tests; UCS4 is not completely supported yet
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -58,7 +58,8 @@
# you still get two surrogate unicode characters in the result.
# These are the Python2 rules; Python3 differs.
consumed, length = rutf8.str_check_utf8(
- string, "strict", final=True, errorhandler=decode_error_handler(space),
+ string, len(string), "strict", final=True,
+ errorhandler=decode_error_handler(space),
allow_surrogates=True)
return length
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -16,7 +16,7 @@
from pypy.objspace.std.formatting import mod_format
from pypy.objspace.std.stringmethods import StringMethods
from pypy.objspace.std.unicodeobject import (
- decode_object, utf8_from_encoded_object,
+ decode_object, unicode_from_encoded_object,
getdefaultencoding)
from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
@@ -717,7 +717,7 @@
self_as_unicode = unicode_from_encoded_object(space, self, None,
None)
return space.newbool(
- self_as_unicode._value.find(w_sub._value) >= 0)
+ self_as_unicode._utf8.find(w_sub._utf8) >= 0)
return self._StringMethods_descr_contains(space, w_sub)
_StringMethods_descr_replace = descr_replace
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -94,7 +94,7 @@
else:
return space.fromcache(BytesListStrategy)
- elif type(w_firstobj) is W_UnicodeObject:
+ elif False and type(w_firstobj) is W_UnicodeObject: # disable unicode list strat
# check for all-unicodes
for i in range(1, len(list_w)):
if type(list_w[i]) is not W_UnicodeObject:
@@ -195,6 +195,7 @@
@staticmethod
def newlist_unicode(space, list_u):
+ xxxx
strategy = space.fromcache(UnicodeListStrategy)
storage = strategy.erase(list_u)
return W_ListObject.from_storage_and_strategy(space, storage, strategy)
@@ -958,8 +959,8 @@
strategy = self.space.fromcache(IntegerListStrategy)
elif type(w_item) is W_BytesObject:
strategy = self.space.fromcache(BytesListStrategy)
- elif type(w_item) is W_UnicodeObject:
- strategy = self.space.fromcache(UnicodeListStrategy)
+ #elif type(w_item) is W_UnicodeObject:
+ # strategy = self.space.fromcache(UnicodeListStrategy)
elif type(w_item) is W_FloatObject:
strategy = self.space.fromcache(FloatListStrategy)
else:
@@ -2005,7 +2006,7 @@
return self.space.newunicode(stringval)
def unwrap(self, w_string):
- return self.space.unicode_w(w_string)
+ return self.space.utf8_w(w_string)
erase, unerase = rerased.new_erasing_pair("unicode")
erase = staticmethod(erase)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -299,6 +299,7 @@
newlist_text = newlist_bytes
def newlist_unicode(self, list_u):
+ return self.newlist(list_u)
return W_ListObject.newlist_unicode(self, list_u)
def newlist_int(self, list_i):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -2,13 +2,13 @@
from rpython.rlib.objectmodel import (
compute_hash, compute_unique_id, import_from_mixin,
- enforceargs)
+ enforceargs, newlist_hint)
from rpython.rlib.buffer import StringBuffer
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib.runicode import (
make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
-from rpython.rlib import rutf8
+from rpython.rlib import rutf8, jit
from pypy.interpreter import unicodehelper
from pypy.interpreter.baseobjspace import W_Root
@@ -110,8 +110,8 @@
"found", len(self._value))
return space.newint(ord(self._value[0]))
- def _new(self, value):
- return W_UnicodeObject(value)
+ def _new(self, value, length):
+ return W_UnicodeObject(value, length)
def _new_from_list(self, value):
return W_UnicodeObject(u''.join(value))
@@ -120,7 +120,7 @@
return W_UnicodeObject.EMPTY
def _len(self):
- return len(self._value)
+ return self._length
_val = utf8_w
@@ -135,18 +135,25 @@
if isinstance(w_other, W_UnicodeObject):
return w_other._utf8
if space.isinstance_w(w_other, space.w_bytes):
- return utf8_from_string(space, w_other)._utf8
+ return unicode_from_string(space, w_other)._utf8
if strict:
raise oefmt(space.w_TypeError,
"%s arg must be None, unicode or str", strict)
- return utf8_from_encoded_object(
- space, w_other, None, "strict")._value
+ return unicode_from_encoded_object(
+ space, w_other, None, "strict")._utf8
+
+ def _convert_to_unicode(self, space, w_other):
+ if isinstance(w_other, W_UnicodeObject):
+ return w_other
+ if space.isinstance_w(w_other, space.w_bytes):
+ return unicode_from_string(space, w_other)
+ return unicode_from_encoded_object(space, w_other, None, "strict")
def _chr(self, char):
assert len(char) == 1
return unicode(char)[0]
- _builder = UnicodeBuilder
+ _builder = StringBuilder
def _isupper(self, ch):
return unicodedb.isupper(ord(ch))
@@ -423,6 +430,46 @@
def _starts_ends_overflow(self, prefix):
return len(prefix) == 0
+ def descr_add(self, space, w_other):
+ try:
+ w_other = self._convert_to_unicode(space, w_other)
+ except OperationError as e:
+ if e.match(space, space.w_TypeError):
+ return space.w_NotImplemented
+ raise
+ return W_UnicodeObject(self._utf8 + w_other._utf8,
+ self._length + w_other._length)
+
+ @jit.look_inside_iff(lambda self, space, list_w, size:
+ jit.loop_unrolling_heuristic(list_w, size))
+ def _str_join_many_items(self, space, list_w, size):
+ value = self._utf8
+ lgt = self._length * (size - 1)
+
+ prealloc_size = len(value) * (size - 1)
+ unwrapped = newlist_hint(size)
+ for i in range(size):
+ w_s = list_w[i]
+ check_item = self._join_check_item(space, w_s)
+ if check_item == 1:
+ raise oefmt(space.w_TypeError,
+ "sequence item %d: expected string, %T found",
+ i, w_s)
+ elif check_item == 2:
+ return self._join_autoconvert(space, list_w)
+ # XXX Maybe the extra copy here is okay? It was basically going to
+ # happen anyway, what with being placed into the builder
+ w_u = self._convert_to_unicode(space, w_s)
+ unwrapped.append(w_u._utf8)
+ lgt += w_u._length
+ prealloc_size += len(unwrapped[i])
+
+ sb = self._builder(prealloc_size)
+ for i in range(size):
+ if value and i != 0:
+ sb.append(value)
+ sb.append(unwrapped[i])
+ return self._new(sb.build(), lgt)
def wrapunicode(space, uni):
return W_UnicodeObject(uni)
@@ -515,7 +562,7 @@
unicodehelper.decode_error_handler(space)(None,
'ascii', "ordinal not in range(128)", s, e.pos, e.pos+1)
assert False
- return space.newunicode(s)
+ return space.newunicode(s, len(s))
if encoding == 'utf-8':
yyy
s = space.charbuf_w(w_obj)
@@ -534,7 +581,7 @@
return w_retval
-def utf8_from_encoded_object(space, w_obj, encoding, errors):
+def unicode_from_encoded_object(space, w_obj, encoding, errors):
# explicitly block bytearray on 2.7
from .bytearrayobject import W_BytearrayObject
if isinstance(w_obj, W_BytearrayObject):
@@ -571,7 +618,7 @@
return unicode_from_encoded_object(space, w_res, None, "strict")
-def utf8_from_string(space, w_bytes):
+def unicode_from_string(space, w_bytes):
# this is a performance and bootstrapping hack
encoding = getdefaultencoding(space)
if encoding != 'ascii':
@@ -582,7 +629,7 @@
rutf8.check_ascii(s)
except rutf8.AsciiCheckError:
# raising UnicodeDecodeError is messy, "please crash for me"
- return utf8_from_encoded_object(space, w_bytes, "ascii", "strict")
+ return unicode_from_encoded_object(space, w_bytes, "ascii", "strict")
return W_UnicodeObject(s, len(s))
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -243,7 +243,7 @@
errorhandler = default_unicode_error_decode
if size == 0:
- return '', 0
+ return '', 0, 0
lgt = 0
builder = StringBuilder(size)
More information about the pypy-commit
mailing list