[pypy-commit] pypy unicode-utf8: * whack whack whack
fijal
pypy.commits at gmail.com
Sun Feb 26 17:42:13 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90369:50071ee2bad7
Date: 2017-02-26 20:07 +0100
http://bitbucket.org/pypy/pypy/changeset/50071ee2bad7/
Log: * whack whack whack
* have a more sensible 'check for utf8' and a slower version if we
pass something awkward as errorhandler
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1679,6 +1679,13 @@
def utf8_w(self, w_obj):
return w_obj.utf8_w(self)
+ def unicode_w(self, w_obj):
+ return self.utf8_w(w_obj).decode('utf8')
+
+ def newunicode(self, u):
+ assert isinstance(u, unicode)
+ return self.newutf8(u.encode("utf8"), len(u))
+
def convert_to_w_unicode(self, w_obj):
return w_obj.convert_to_w_unicode(self)
@@ -1693,13 +1700,12 @@
"characters")
return rstring.assert_str0(result)
- def realunicode_w(self, w_obj):
- # Like unicode_w, but only works if w_obj is really of type
+ def realutf8_w(self, w_obj):
+ # Like utf8_w, but only works if w_obj is really of type
# 'unicode'.
- xxx
if not self.isinstance_w(w_obj, self.w_unicode):
raise oefmt(self.w_TypeError, "argument must be a unicode")
- return self.unicode_w(w_obj)
+ return self.utf8_w(w_obj)
def bool_w(self, w_obj):
# Unwraps a bool, also accepting an int for compatibility.
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -59,10 +59,12 @@
# If there happen to be two 3-bytes encoding a pair of surrogates,
# you still get two surrogate unicode characters in the result.
# These are the Python2 rules; Python3 differs.
- consumed, length = rutf8.str_check_utf8(
- string, len(string), "strict", final=True,
- errorhandler=decode_error_handler(space),
- allow_surrogates=True)
+ try:
+ consumed, length = rutf8.str_check_utf8(string, len(string), True)
+ except rutf8.Utf8CheckError as e:
+ decode_error_handler(space)('strict', 'utf8', e.msg, string, e.startpos,
+ e.endpos)
+            assert False, "unreachable"
return length
def encode_utf8(space, uni):
@@ -78,4 +80,69 @@
def utf8_encode_ascii(utf8, utf8len, errors, errorhandler):
if len(utf8) == utf8len:
return utf8
- xxx
+ return rutf8.utf8_encode_ascii(utf8, errors, 'ascii',
+                                   'ordinal not in range(128)',
+ errorhandler)
+
+def str_decode_ascii(s, slen, errors, final, errorhandler):
+ try:
+ rutf8.check_ascii(s)
+        return s, slen, slen
+ except rutf8.AsciiCheckError:
+ return rutf8.str_decode_ascii(s, errors, errorhandler)
+
+# XXX wrappers, think about speed
+
+class DecodeWrapper(object):
+ def __init__(self, handler):
+ self.orig = handler
+
+ def handle(self, errors, encoding, msg, s, pos, endpos):
+ s, p, lgt = self.orig(errors, encoding, msg, s, pos, endpos)
+ return s.decode("utf8"), p
+
+class EncodeWrapper(object):
+ def __init__(self, handler):
+ self.orig = handler
+
+ def handle(self, errors, encoding, msg, s, pos, endpos):
+ s, rs, p, lgt = self.orig(errors, encoding, msg, s.encode("utf8"), pos, endpos)
+ return s, rs, p
+
+def utf8_encode_utf_7(utf8, utf8len, errors, errorhandler):
+ u = utf8.decode("utf8")
+ w = EncodeWrapper(errorhandler)
+ return runicode.unicode_encode_utf_7(u, len(u), errors, w.handle)
+
+def str_decode_utf_7(string, lgt, errors, final, errorhandler):
+ w = DecodeWrapper(errorhandler)
+ u, pos = runicode.str_decode_utf_7(string, lgt, errors, final, w.handle)
+ return u.encode('utf8'), pos, len(u)
+
+def str_decode_unicode_escape(s, slen, errors, final, errorhandler, ud_handler):
+ w = DecodeWrapper(errorhandler)
+ u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final, w.handle,
+ ud_handler)
+ return u.encode('utf8'), pos, len(u)
+
+def str_decode_raw_unicode_escape(s, slen, errors, final, errorhandler):
+ w = DecodeWrapper(errorhandler)
+ u, pos = runicode.str_decode_raw_unicode_escape(s, slen, errors, final,
+ w.handle)
+ return u.encode('utf8'), pos, len(u)
+
+def str_decode_utf8(s, slen, errors, final, errorhandler):
+ w = DecodeWrapper(errorhandler)
+ u, pos = runicode.str_decode_utf_8_impl(s, slen, errors, final, w.handle,
+ runicode.allow_surrogate_by_default)
+ return u.encode('utf8'), pos, len(u)
+
+def utf8_encode_utf_16(utf8, utf8len, errors, errorhandler):
+ w = EncodeWrapper(errorhandler)
+ u = utf8.decode("utf8")
+ return runicode.unicode_encode_utf_16(u, len(u), errors, w.handle)
+
+def utf8_encode_latin_1(utf8, utf8len, errors, errorhandler):
+ w = EncodeWrapper(errorhandler)
+ u = utf8.decode("utf8")
+ return runicode.unicode_encode_latin_1(u, len(u), errors, w.handle)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,4 +1,4 @@
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import we_are_translated
from rpython.rlib.rstring import UnicodeBuilder
from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
@@ -39,7 +39,7 @@
w_input = space.newbytes(input)
else:
w_cls = space.w_UnicodeEncodeError
- w_input = space.newunicode(input)
+ w_input = space.newutf8(input, -1)
w_exc = space.call_function(
w_cls,
space.newtext(encoding),
@@ -65,8 +65,8 @@
raise oefmt(space.w_IndexError,
"position %d from error handler out of bounds",
newpos)
- replace = space.unicode_w(w_replace)
- return replace, newpos
+ w_replace = space.convert_to_w_unicode(w_replace)
+ return w_replace._utf8, newpos, w_replace._length
return call_errorhandler
def make_decode_errorhandler(self, space):
@@ -76,9 +76,9 @@
errorhandler = self._make_errorhandler(space, False)
def encode_call_errorhandler(errors, encoding, reason, input, startpos,
endpos):
- replace, newpos = errorhandler(errors, encoding, reason, input,
+ replace, newpos, lgt = errorhandler(errors, encoding, reason, input,
startpos, endpos)
- return replace, None, newpos
+ return replace, None, newpos, lgt
return encode_call_errorhandler
def get_unicodedata_handler(self, space):
@@ -190,7 +190,9 @@
def ignore_errors(space, w_exc):
check_exception(space, w_exc)
w_end = space.getattr(w_exc, space.newtext('end'))
- return space.newtuple([space.newunicode(u''), w_end])
+ return space.newtuple([space.newutf8('', 0), w_end])
+
+REPLACEMENT = u'\ufffd'.encode('utf8')
def replace_errors(space, w_exc):
check_exception(space, w_exc)
@@ -198,14 +200,14 @@
w_end = space.getattr(w_exc, space.newtext('end'))
size = space.int_w(w_end) - space.int_w(w_start)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
- text = u'?' * size
- return space.newtuple([space.newunicode(text), w_end])
+ text = '?' * size
+ return space.newtuple([space.newutf8(text, size), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
- text = u'\ufffd'
- return space.newtuple([space.newunicode(text), w_end])
+ text = REPLACEMENT
+ return space.newtuple([space.newutf8(text, 1), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
- text = u'\ufffd' * size
- return space.newtuple([space.newunicode(text), w_end])
+ text = REPLACEMENT * size
+ return space.newtuple([space.newutf8(text, size), w_end])
else:
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
@@ -392,14 +394,17 @@
@unwrap_spec(string='bufferstr', errors='str_or_None',
w_final=WrappedDefault(False))
def wrap_decoder(space, string, errors="strict", w_final=None):
+ from pypy.interpreter import unicodehelper
+
if errors is None:
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- func = getattr(runicode, rname)
- result, consumed = func(string, len(string), errors,
+ func = getattr(unicodehelper, rname)
+ result, consumed, length = func(string, len(string), errors,
final, state.decode_error_handler)
- return space.newtuple([space.newunicode(result), space.newint(consumed)])
+ return space.newtuple([space.newutf8(result, length),
+ space.newint(consumed)])
wrap_decoder.func_name = rname
globals()[name] = wrap_decoder
@@ -441,33 +446,42 @@
# "allow_surrogates=True"
@unwrap_spec(utf8='utf8', errors='str_or_None')
def utf_8_encode(space, utf8, utf8len, errors="strict"):
- if errors is None:
- errors = 'strict'
- xxx
- state = space.fromcache(CodecState)
- # NB. can't call unicode_encode_utf_8() directly because that's
- # an @elidable function nowadays. Instead, we need the _impl().
- # (The problem is the errorhandler, which calls arbitrary Python.)
- result = runicode.unicode_encode_utf_8_impl(
- uni, len(uni), errors, state.encode_error_handler,
- allow_surrogates=True)
- return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+ return space.newtuple([space.newbytes(utf8), space.newint(utf8len)])
@unwrap_spec(string='bufferstr', errors='str_or_None',
w_final = WrappedDefault(False))
def utf_8_decode(space, string, errors="strict", w_final=None):
+ from pypy.interpreter import unicodehelper
+
if errors is None:
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- # NB. can't call str_decode_utf_8() directly because that's
- # an @elidable function nowadays. Instead, we need the _impl().
- # (The problem is the errorhandler, which calls arbitrary Python.)
- result, consumed = runicode.str_decode_utf_8_impl(
- string, len(string), errors,
- final, state.decode_error_handler,
- allow_surrogates=True)
- return space.newtuple([space.newunicode(result), space.newint(consumed)])
+ # call the fast version for checking
+ try:
+ consumed, lgt = rutf8.str_check_utf8(string, len(string), final)
+ except rutf8.Utf8CheckError as e:
+ if errors == 'strict':
+ # just raise
+ state.decode_error_handler(errors, 'utf8', e.msg, string,
+ e.startpos, e.endpos)
+ assert False, "raises"
+        # XXX go the long way around via runicode - we can optimize it later if we
+        # decide we care about obscure cases
+ res, consumed, lgt = unicodehelper.str_decode_utf8(string, len(string),
+ errors, final, state.decode_error_handler)
+ return space.newtuple([space.newutf8(res, lgt),
+ space.newint(consumed)])
+ #result, consumed = runicode.str_decode_utf_8_impl(
+ # string, len(string), errors,
+ # final, state.decode_error_handler,
+ # allow_surrogates=True)
+ if final or consumed == len(string):
+ return space.newtuple([space.newutf8(string, lgt),
+ space.newint(consumed)])
+
+ return space.newtuple([space.newutf8(string[:consumed], lgt),
+ space.newint(consumed)])
@unwrap_spec(data='bufferstr', errors='str_or_None', byteorder=int,
w_final=WrappedDefault(False))
@@ -655,6 +669,8 @@
@unwrap_spec(string='bufferstr', errors='str_or_None',
w_final=WrappedDefault(False))
def unicode_escape_decode(space, string, errors="strict", w_final=None):
+ from pypy.interpreter import unicodehelper
+
if errors is None:
errors = 'strict'
final = space.is_true(w_final)
@@ -662,12 +678,12 @@
unicode_name_handler = state.get_unicodedata_handler(space)
- result, consumed = runicode.str_decode_unicode_escape(
+ result, consumed, lgt = unicodehelper.str_decode_unicode_escape(
string, len(string), errors,
final, state.decode_error_handler,
unicode_name_handler)
- return space.newtuple([space.newunicode(result), space.newint(consumed)])
+ return space.newtuple([space.newutf8(result, lgt), space.newint(consumed)])
# ____________________________________________________________
# Unicode-internal
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -719,7 +719,7 @@
def descr_init(self, space, w_encoding, w_object, w_start, w_end, w_reason):
# typechecking
space.realstr_w(w_encoding)
- space.realunicode_w(w_object)
+ space.realutf8_w(w_object)
space.int_w(w_start)
space.int_w(w_end)
space.realstr_w(w_reason)
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -460,6 +460,11 @@
def str_w(self, space):
return self._value
+ def utf8_w(self, space):
+ # Use the default encoding.
+ encoding = getdefaultencoding(space)
+ return space.utf8_w(decode_object(space, self, encoding, None))
+
def buffer_w(self, space, flags):
space.check_buf_flags(flags, True)
return StringBuffer(self._value)
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -650,9 +650,9 @@
if type(w_key) is self.space.StringObjectCls:
self.switch_to_bytes_strategy(w_dict)
return
- elif type(w_key) is self.space.UnicodeObjectCls:
- self.switch_to_unicode_strategy(w_dict)
- return
+ #elif type(w_key) is self.space.UnicodeObjectCls:
+ # self.switch_to_unicode_strategy(w_dict)
+ # return
w_type = self.space.type(w_key)
if self.space.is_w(w_type, self.space.w_int):
self.switch_to_int_strategy(w_dict)
@@ -668,6 +668,7 @@
w_dict.dstorage = storage
def switch_to_unicode_strategy(self, w_dict):
+ xxx
strategy = self.space.fromcache(UnicodeDictStrategy)
storage = strategy.get_empty_storage()
w_dict.set_strategy(strategy)
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -511,7 +511,8 @@
pass
else:
return space.newbytes(result)
- fmt = space.unicode_w(w_fmt)
+ # XXX for now, this is performance critical
+ fmt = space.utf8_w(w_fmt).decode("utf8")
formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
result = formatter.format()
return space.newunicode(result)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -491,8 +491,8 @@
return w_obj.listview_unicode()
if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
return w_obj.listview_unicode()
- if isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj):
- return w_obj.listview_unicode()
+ #if isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj):
+ # return w_obj.listview_unicode()
if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj):
return w_obj.getitems_unicode()
return None
diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py
--- a/pypy/objspace/std/setobject.py
+++ b/pypy/objspace/std/setobject.py
@@ -799,8 +799,8 @@
strategy = self.space.fromcache(IntegerSetStrategy)
elif type(w_key) is W_BytesObject:
strategy = self.space.fromcache(BytesSetStrategy)
- elif type(w_key) is W_UnicodeObject:
- strategy = self.space.fromcache(UnicodeSetStrategy)
+ #elif type(w_key) is W_UnicodeObject:
+ # strategy = self.space.fromcache(UnicodeSetStrategy)
elif self.space.type(w_key).compares_by_identity():
strategy = self.space.fromcache(IdentitySetStrategy)
else:
@@ -1640,13 +1640,13 @@
return
# check for unicode
- for w_item in iterable_w:
- if type(w_item) is not W_UnicodeObject:
- break
- else:
- w_set.strategy = space.fromcache(UnicodeSetStrategy)
- w_set.sstorage = w_set.strategy.get_storage_from_list(iterable_w)
- return
+ #for w_item in iterable_w:
+ # if type(w_item) is not W_UnicodeObject:
+ # break
+ #else:
+ # w_set.strategy = space.fromcache(UnicodeSetStrategy)
+ # w_set.sstorage = w_set.strategy.get_storage_from_list(iterable_w)
+ # return
# check for compares by identity
for w_item in iterable_w:
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -142,6 +142,7 @@
assert self.space.listview_bytes(w_d) == ["a", "b"]
def test_listview_unicode_dict(self):
+ py.test.skip("listview_unicode disabled")
w = self.space.wrap
w_d = self.space.newdict()
w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(2))])
@@ -175,6 +176,7 @@
# XXX: it would be nice if the test passed without monkeypatch.undo(),
# but we need space.newlist_unicode for it
monkeypatch.undo()
+ py.test.skip("listview_unicode disabled")
w_d = self.space.newdict()
w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(6))])
w_l = self.space.call_method(w_d, "keys")
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -20,8 +20,8 @@
IntegerListStrategy)
assert isinstance(W_ListObject(space, [wb('a'), wb('b')]).strategy,
BytesListStrategy)
- assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
- UnicodeListStrategy)
+ #assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
+ # UnicodeListStrategy)
assert isinstance(W_ListObject(space, [w(u'a'), wb('b')]).strategy,
ObjectListStrategy) # mixed unicode and bytes
@@ -47,7 +47,7 @@
l = W_ListObject(space, [])
assert isinstance(l.strategy, EmptyListStrategy)
l.append(w(u'a'))
- assert isinstance(l.strategy, UnicodeListStrategy)
+ #assert isinstance(l.strategy, UnicodeListStrategy)
l = W_ListObject(space, [])
assert isinstance(l.strategy, EmptyListStrategy)
@@ -74,6 +74,7 @@
assert isinstance(l.strategy, ObjectListStrategy)
def test_unicode_to_any(self):
+ py.test.skip("disabled")
space = self.space
l = W_ListObject(space, [space.wrap(u'a'), space.wrap(u'b'), space.wrap(u'c')])
assert isinstance(l.strategy, UnicodeListStrategy)
@@ -117,7 +118,7 @@
# UnicodeStrategy to ObjectStrategy
l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
- assert isinstance(l.strategy, UnicodeListStrategy)
+ #assert isinstance(l.strategy, UnicodeListStrategy)
l.setitem(0, w(2))
assert isinstance(l.strategy, ObjectListStrategy)
@@ -145,7 +146,7 @@
# UnicodeStrategy
l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
- assert isinstance(l.strategy, UnicodeListStrategy)
+ #assert isinstance(l.strategy, UnicodeListStrategy)
l.insert(3, w(2))
assert isinstance(l.strategy, ObjectListStrategy)
@@ -225,7 +226,7 @@
# UnicodeStrategy to ObjectStrategy
l = W_ListObject(space, [w(u'a'), w(u'b'), w(u'c')])
- assert isinstance(l.strategy, UnicodeListStrategy)
+ #assert isinstance(l.strategy, UnicodeListStrategy)
l.setslice(0, 1, 2, W_ListObject(space, [w(1), w(2), w(3)]))
assert isinstance(l.strategy, ObjectListStrategy)
@@ -275,7 +276,7 @@
l = W_ListObject(space, wrapitems([u"a",u"b",u"c",u"d",u"e"]))
other = W_ListObject(space, wrapitems([u"a", u"b", u"c"]))
keep_other_strategy(l, 0, 2, other.length(), other)
- assert l.strategy is space.fromcache(UnicodeListStrategy)
+ #assert l.strategy is space.fromcache(UnicodeListStrategy)
l = W_ListObject(space, wrapitems([1.1, 2.2, 3.3, 4.4, 5.5]))
other = W_ListObject(space, [])
@@ -345,7 +346,7 @@
empty = W_ListObject(space, [])
assert isinstance(empty.strategy, EmptyListStrategy)
empty.extend(W_ListObject(space, [w(u"a"), w(u"b"), w(u"c")]))
- assert isinstance(empty.strategy, UnicodeListStrategy)
+ #assert isinstance(empty.strategy, UnicodeListStrategy)
empty = W_ListObject(space, [])
assert isinstance(empty.strategy, EmptyListStrategy)
@@ -601,7 +602,7 @@
l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")])
assert isinstance(l1.strategy, BytesListStrategy)
l2 = W_ListObject(self.space, [self.space.newunicode(u"eins"), self.space.newunicode(u"zwei")])
- assert isinstance(l2.strategy, UnicodeListStrategy)
+ #assert isinstance(l2.strategy, UnicodeListStrategy)
l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newunicode(u"zwei")])
assert isinstance(l3.strategy, ObjectListStrategy)
@@ -612,6 +613,7 @@
assert space.listview_bytes(w_l) == ["a", "b"]
def test_listview_unicode(self):
+ py.test.skip("disabled")
space = self.space
assert space.listview_unicode(space.wrap(1)) == None
w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
@@ -624,6 +626,7 @@
assert space.str_w(space.call_method(space.wrap("c"), "join", w_l)) == "acb"
#
# the same for unicode
+ py.test.skip("disabled")
w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
w_l.getitems = None
assert space.unicode_w(space.call_method(space.wrap(u"c"), "join", w_l)) == u"acb"
@@ -636,6 +639,7 @@
assert space.is_w(space.call_method(space.wrap(" -- "), "join", w_l), w_text)
#
# the same for unicode
+ py.test.skip("disabled")
w_text = space.wrap(u"text")
w_l = self.space.newlist([w_text])
w_l.getitems = None
@@ -665,6 +669,7 @@
assert space.listview_bytes(w_l4) == ["a", "b", "c"]
def test_unicode_uses_newlist_unicode(self):
+ py.test.skip("disabled")
space = self.space
w_u = space.wrap(u"a b c")
space.newlist = None
@@ -720,6 +725,7 @@
assert self.space.listview_bytes(w_l) == ["a", "b"]
def test_listview_unicode_list(self):
+ py.test.skip("disabled")
space = self.space
w_l = W_ListObject(space, [space.wrap(u"a"), space.wrap(u"b")])
assert self.space.listview_unicode(w_l) == [u"a", u"b"]
diff --git a/pypy/objspace/std/test/test_obj.py b/pypy/objspace/std/test/test_obj.py
--- a/pypy/objspace/std/test/test_obj.py
+++ b/pypy/objspace/std/test/test_obj.py
@@ -17,7 +17,7 @@
cls.w_cpython_apptest = space.wrap(option.runappdirect and not hasattr(sys, 'pypy_translation_info'))
def w_unwrap_wrap_unicode(space, w_obj):
- return space.wrap(space.unicode_w(w_obj))
+ return space.newutf8(space.utf8_w(w_obj), w_obj._length)
cls.w_unwrap_wrap_unicode = space.wrap(gateway.interp2app(w_unwrap_wrap_unicode))
def w_unwrap_wrap_str(space, w_obj):
return space.wrap(space.str_w(w_obj))
@@ -194,7 +194,8 @@
assert id('') == (256 << 4) | 11 # always
assert id(u'') == (257 << 4) | 11
assert id('a') == (ord('a') << 4) | 11
- assert id(u'\u1234') == ((~0x1234) << 4) | 11
+ # we no longer cache unicodes <128
+ # assert id(u'\u1234') == ((~0x1234) << 4) | 11
def test_id_of_tuples(self):
l = []
diff --git a/pypy/objspace/std/test/test_setobject.py b/pypy/objspace/std/test/test_setobject.py
--- a/pypy/objspace/std/test/test_setobject.py
+++ b/pypy/objspace/std/test/test_setobject.py
@@ -105,8 +105,8 @@
w_list = self.space.iter(W_ListObject(self.space, [w(u"1"), w(u"2"), w(u"3")]))
w_set = W_SetObject(self.space)
_initialize_set(self.space, w_set, w_list)
- assert w_set.strategy is self.space.fromcache(UnicodeSetStrategy)
- assert w_set.strategy.unerase(w_set.sstorage) == {u"1":None, u"2":None, u"3":None}
+ #assert w_set.strategy is self.space.fromcache(UnicodeSetStrategy)
+ #assert w_set.strategy.unerase(w_set.sstorage) == {u"1":None, u"2":None, u"3":None}
w_list = W_ListObject(self.space, [w("1"), w(2), w("3")])
w_set = W_SetObject(self.space)
diff --git a/pypy/objspace/std/test/test_setstrategies.py b/pypy/objspace/std/test/test_setstrategies.py
--- a/pypy/objspace/std/test/test_setstrategies.py
+++ b/pypy/objspace/std/test/test_setstrategies.py
@@ -1,3 +1,5 @@
+
+import py
from pypy.objspace.std.setobject import W_SetObject
from pypy.objspace.std.setobject import (
BytesIteratorImplementation, BytesSetStrategy, EmptySetStrategy,
@@ -26,8 +28,8 @@
s = W_SetObject(self.space, self.wrapped(["a", "b"]))
assert s.strategy is self.space.fromcache(BytesSetStrategy)
- s = W_SetObject(self.space, self.wrapped([u"a", u"b"]))
- assert s.strategy is self.space.fromcache(UnicodeSetStrategy)
+ #s = W_SetObject(self.space, self.wrapped([u"a", u"b"]))
+ #assert s.strategy is self.space.fromcache(UnicodeSetStrategy)
def test_switch_to_object(self):
s = W_SetObject(self.space, self.wrapped([1,2,3,4,5]))
@@ -40,6 +42,7 @@
assert s1.strategy is self.space.fromcache(ObjectSetStrategy)
def test_switch_to_unicode(self):
+ py.test.skip("disabled")
s = W_SetObject(self.space, self.wrapped([]))
s.add(self.space.wrap(u"six"))
assert s.strategy is self.space.fromcache(UnicodeSetStrategy)
@@ -128,11 +131,11 @@
assert space.unwrap(it.next()) == "a"
assert space.unwrap(it.next()) == "b"
#
- s = W_SetObject(space, self.wrapped([u"a", u"b"]))
- it = s.iter()
- assert isinstance(it, UnicodeIteratorImplementation)
- assert space.unwrap(it.next()) == u"a"
- assert space.unwrap(it.next()) == u"b"
+ #s = W_SetObject(space, self.wrapped([u"a", u"b"]))
+ #it = s.iter()
+ #assert isinstance(it, UnicodeIteratorImplementation)
+ #assert space.unwrap(it.next()) == u"a"
+ #assert space.unwrap(it.next()) == u"b"
def test_listview(self):
space = self.space
@@ -142,5 +145,5 @@
s = W_SetObject(space, self.wrapped(["a", "b"]))
assert sorted(space.listview_bytes(s)) == ["a", "b"]
#
- s = W_SetObject(space, self.wrapped([u"a", u"b"]))
- assert sorted(space.listview_unicode(s)) == [u"a", u"b"]
+ #s = W_SetObject(space, self.wrapped([u"a", u"b"]))
+ #assert sorted(space.listview_unicode(s)) == [u"a", u"b"]
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -518,7 +518,7 @@
raises(TypeError, u'hello'.translate)
raises(TypeError, u'abababc'.translate, {ord('a'):''})
- def test_unicode_form_encoded_object(self):
+ def test_unicode_from_encoded_object(self):
assert unicode('x', 'utf-8') == u'x'
assert unicode('x', 'utf-8', 'strict') == u'x'
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -70,12 +70,14 @@
def immutable_unique_id(self, space):
if self.user_overridden_class:
return None
- s = space.unicode_w(self)
- if len(s) > 1:
+ s = space.utf8_w(self)
+ if len(s) > 2:
uid = compute_unique_id(s)
else: # strings of len <= 1 are unique-ified
if len(s) == 1:
base = ~ord(s[0]) # negative base values
+ elif len(s) == 2:
+ base = ~((ord(s[1]) << 8) | ord(s[0]))
else:
base = 257 # empty unicode string: base value 257
uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL
@@ -88,9 +90,11 @@
return self._utf8
def readbuf_w(self, space):
+ # XXX for now
from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
- builder = StringBuilder(len(self._value) * UNICODE_SIZE)
- for unich in self._value:
+ v = self._utf8.decode("utf8")
+ builder = StringBuilder(len(v) * UNICODE_SIZE)
+ for unich in v:
pack_unichar(unich, builder)
return StringBuffer(builder.build())
@@ -331,7 +335,8 @@
formatter = newformat.unicode_formatter(space, spec)
self2 = unicode_from_object(space, self)
assert isinstance(self2, W_UnicodeObject)
- return formatter.format_string(self2._value)
+ # XXX
+ return formatter.format_string(self2._utf8.decode("utf8"))
def descr_mod(self, space, w_values):
return mod_format(space, self, w_values, do_unicode=True)
@@ -617,21 +622,28 @@
if errors is None or errors == 'strict':
try:
if encoding == 'ascii':
- u = space.unicode_w(w_object)
- eh = unicodehelper.raise_unicode_exception_encode
- return space.newbytes(unicode_encode_ascii(
- u, len(u), None, errorhandler=eh))
+ s = space.utf8_w(w_object)
+ try:
+ rutf8.check_ascii(s)
+ except rutf8.AsciiCheckError as a:
+ eh = unicodehelper.raise_unicode_exception_encode
+ eh(None, "ascii", "ordinal not in range(128)", s,
+ a.pos, a.pos + 1)
+ assert False, "always raises"
+ return space.newbytes(s)
if encoding == 'utf-8':
- u = space.unicode_w(w_object)
- eh = unicodehelper.raise_unicode_exception_encode
- return space.newbytes(unicode_encode_utf_8(
- u, len(u), None, errorhandler=eh,
- allow_surrogates=True))
+ u = space.utf8_w(w_object)
+ return space.newbytes(u)
+ # XXX is this enough?
+ #eh = unicodehelper.raise_unicode_exception_encode
+ #return space.newbytes(unicode_encode_utf_8(
+ # u, len(u), None, errorhandler=eh,
+ # allow_surrogates=True))
except unicodehelper.RUnicodeEncodeError as ue:
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([
space.newtext(ue.encoding),
- space.newunicode(ue.object),
+ space.newutf8(ue.object, -1),
space.newint(ue.start),
space.newint(ue.end),
space.newtext(ue.reason)]))
@@ -665,13 +677,15 @@
assert False
return space.newutf8(s, len(s))
if encoding == 'utf-8':
- yyy
s = space.charbuf_w(w_obj)
eh = unicodehelper.decode_error_handler(space)
- return space.newunicode(str_decode_utf_8(
- s, len(s), None, final=True, errorhandler=eh,
- allow_surrogates=True)[0])
- xxx
+ try:
+ _, lgt = rutf8.str_check_utf8(s, len(s), final=True,
+ allow_surrogates=True)
+ except rutf8.Utf8CheckError as e:
+ eh(None, 'utf8', e.msg, s, e.startpos, e.endpos)
+ assert False, "has to raise"
+ return space.newutf8(s, lgt)
w_codecs = space.getbuiltinmodule("_codecs")
w_decode = space.getattr(w_codecs, space.newtext("decode"))
if errors is None:
@@ -723,7 +737,6 @@
# this is a performance and bootstrapping hack
encoding = getdefaultencoding(space)
if encoding != 'ascii':
- xxx
return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
s = space.bytes_w(w_bytes)
try:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -1,9 +1,6 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rlib import runicode, jit
-from rpython.rlib.rarithmetic import r_uint
-from rpython.rlib.nonconst import NonConstant
-from rpython.tool.sourcetools import func_with_new_name
def unichr_as_utf8(code):
""" Encode code (numeric value) as utf8 encoded string
@@ -81,6 +78,38 @@
def default_unicode_error_check(*args):
xxx
+def utf8_encode_ascii(s, errors, encoding, msg, errorhandler):
+ res = StringBuilder(len(s))
+ u_pos = 0
+ pos = 0
+ while pos < len(s):
+ chr1 = s[pos]
+ if ord(chr1) < 0x80:
+ res.append(chr1)
+ else:
+ repl, _, _, _ = errorhandler(errors, encoding, msg, s, u_pos, u_pos + 1)
+ res.append(repl)
+ u_pos += 1
+ pos = next_codepoint_pos(s, pos)
+ return res.build()
+
+def str_decode_ascii(s, errors, errorhandler):
+ # ASCII is equivalent to the first 128 ordinals in Unicode.
+ size = len(s)
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ c = s[pos]
+ if ord(c) < 128:
+ result.append(c)
+ else:
+ r, _, _ = errorhandler(errors, "ascii", "ordinal not in range(128)",
+ s, pos, pos + 1)
+ result.append(r)
+ pos += 1
+ return result.build(), pos, -1
+
+
def default_unicode_error_decode(errors, encoding, message, s, pos, endpos, lgt):
if errors == 'replace':
return '\xef\xbf\xbd', endpos, lgt + 1 # u'\ufffd'
@@ -105,30 +134,28 @@
return chr3 == 0xa8 or chr3 == 0xa9
return False
-# if you can't use the @elidable version, call str_check_utf8_impl()
-# directly
+class Utf8CheckError(Exception):
+ def __init__(self, msg, startpos, endpos):
+ self.msg = msg
+ self.startpos = startpos
+ self.endpos = endpos
+
@jit.elidable
-def str_check_utf8(s, size, errors, final=False,
- errorhandler=None,
+def str_check_utf8(s, size, final=False,
allow_surrogates=runicode.allow_surrogate_by_default):
- if errorhandler is None:
- errorhandler = default_unicode_error_check
- # XXX unclear, fix
+ """ A simplified version of utf8 encoder - it only works with 'strict'
+ error handling.
+ """
+ # XXX do the following in a cleaner way, e.g. via signature
# NB. a bit messy because rtyper/rstr.py also calls the same
# function. Make sure we annotate for the args it passes, too
- if NonConstant(False):
- s = NonConstant('?????')
- size = NonConstant(12345)
- errors = NonConstant('strict')
- final = NonConstant(True)
- WTF # goes here
- errorhandler = ll_unicode_error_decode
- allow_surrogates = NonConstant(True)
- return str_check_utf8_elidable(s, size, errors, final, errorhandler,
- allow_surrogates=allow_surrogates)
-
-def str_check_utf8_impl(s, size, errors, final, errorhandler,
- allow_surrogates):
+ #if NonConstant(False):
+ # s = NonConstant('?????')
+ # size = NonConstant(12345)
+ # errors = NonConstant('strict')
+ # final = NonConstant(True)
+ # errorhandler = ll_unicode_error_decode
+ # allow_surrogates = NonConstant(True)
if size == 0:
return 0, 0
@@ -155,46 +182,43 @@
# in case we need to continue running this loop
if not charsleft:
# there's only the start byte and nothing else
- errorhandler(errors, 'utf8', 'unexpected end of data',
- s, pos, pos+1)
+ raise Utf8CheckError('unexpected end of data', pos, pos + 1)
ordch2 = ord(s[pos+1])
if n == 3:
# 3-bytes seq with only a continuation byte
if runicode._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
# second byte invalid, take the first and continue
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+1)
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 1)
else:
# second byte valid, but third byte missing
- errorhandler(errors, 'utf8', 'unexpected end of data',
- s, pos, pos+2)
+ raise Utf8CheckError('unexpected end of data', pos, pos + 2)
elif n == 4:
# 4-bytes seq with 1 or 2 continuation bytes
if runicode._invalid_byte_2_of_4(ordch1, ordch2):
# second byte invalid, take the first and continue
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+1)
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 1)
elif charsleft == 2 and runicode._invalid_byte_3_of_4(ord(s[pos+2])):
# third byte invalid, take the first two and continue
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+2)
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 2)
else:
# there's only 1 or 2 valid cb, but the others are missing
- errorhandler(errors, 'utf8', 'unexpected end of data',
- s, pos, pos+charsleft+1)
+ raise Utf8CheckError('unexpected end of data', pos,
+ pos + charsleft + 1)
raise AssertionError("unreachable")
if n == 0:
- errorhandler(errors, 'utf8', 'invalid start byte', s, pos, pos+1)
+ raise Utf8CheckError('invalid start byte', pos, pos + 1)
elif n == 1:
assert 0, "ascii should have gone through the fast path"
elif n == 2:
ordch2 = ord(s[pos+1])
if runicode._invalid_byte_2_of_2(ordch2):
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+2)
- assert False, "unreachable"
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 2)
# 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
lgt += 1
pos += 2
@@ -203,13 +227,11 @@
ordch2 = ord(s[pos+1])
ordch3 = ord(s[pos+2])
if runicode._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+1)
- assert False, "unreachable"
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 1)
elif runicode._invalid_byte_3_of_3(ordch3):
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+2)
- assert False, "unreachable"
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 2)
# 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
lgt += 1
pos += 3
@@ -219,17 +241,14 @@
ordch3 = ord(s[pos+2])
ordch4 = ord(s[pos+3])
if runicode._invalid_byte_2_of_4(ordch1, ordch2):
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+1)
- assert False, "unreachable"
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 1)
elif runicode._invalid_byte_3_of_4(ordch3):
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+2)
- assert False, "unreachable"
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 2)
elif runicode._invalid_byte_4_of_4(ordch4):
- errorhandler(errors, 'utf8', 'invalid continuation byte',
- s, pos, pos+3)
- assert False, "unreachable"
+ raise Utf8CheckError('invalid continuation byte', pos,
+ pos + 3)
# 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
c = (((ordch1 & 0x07) << 18) + # 0b00000111
((ordch2 & 0x3F) << 12) + # 0b00111111
@@ -243,5 +262,3 @@
pos += 4
return pos, lgt
-str_check_utf8_elidable = jit.elidable(
- func_with_new_name(str_check_utf8_impl, "str_check_utf8_elidable"))
More information about the pypy-commit
mailing list