[pypy-commit] pypy unicode-utf8: write down check_utf8
fijal
pypy.commits at gmail.com
Tue Feb 21 12:35:54 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90276:07c779b7a698
Date: 2017-02-21 18:35 +0100
http://bitbucket.org/pypy/pypy/changeset/07c779b7a698/
Log: write down check_utf8
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -52,16 +52,15 @@
final=True, errorhandler=decode_error_handler(space))
return result, length
-def decode_utf8(space, string):
+def check_utf8(space, string):
# Surrogates are accepted and not treated specially at all.
# If there happen to be two 3-bytes encoding a pair of surrogates,
# you still get two surrogate unicode characters in the result.
# These are the Python2 rules; Python3 differs.
- result, consumed = runicode.str_decode_utf_8(
- string, len(string), "strict",
- final=True, errorhandler=decode_error_handler(space),
+ consumed, length = rutf8.str_check_utf8(
+ string, "strict", final=True, errorhandler=decode_error_handler(space),
allow_surrogates=True)
- return result
+ return length
def encode_utf8(space, uni):
# Note that this function never raises UnicodeEncodeError,
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -27,10 +27,10 @@
"Return a Unicode string of one character with the given ordinal."
# XXX this assumes unichr would be happy to return you surrogates
try:
- s = unichr_as_utf8(code)
+ s, lgt = unichr_as_utf8(code)
except ValueError:
raise oefmt(space.w_ValueError, "unichr() arg out of range")
- return space.newunicode(s)
+ return space.newunicode(s, lgt)
def len(space, w_obj):
"len(object) -> integer\n\nReturn the number of items of a sequence or mapping."
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -403,8 +403,9 @@
@unmarshaller(TYPE_UNICODE)
def unmarshal_unicode(space, u, tc):
- return space.newunicode(unicodehelper.decode_utf8(space, u.get_str()))
-
+ arg = u.get_str()
+ length = unicodehelper.check_utf8(space, arg)
+ return space.newunicode(arg, length)
@marshaller(W_SetObject)
def marshal_set(space, w_set, m):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -583,7 +583,7 @@
except rutf8.AsciiCheckError:
# raising UnicodeDecodeError is messy, "please crash for me"
return utf8_from_encoded_object(space, w_bytes, "ascii", "strict")
- return W_UnicodeObject(s)
+ return W_UnicodeObject(s, len(s))
class UnicodeDocstrings:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -1,26 +1,32 @@
from rpython.rlib.rstring import StringBuilder
+from rpython.rlib import runicode, jit
+from rpython.rlib.nonconst import NonConstant
+from rpython.tool.sourcetools import func_with_new_name
def unichr_as_utf8(code):
""" Encode code (numeric value) as utf8 encoded string
"""
if code < 0:
raise ValueError
+ lgt = 1
+ if code >= runicode.MAXUNICODE:
+ lgt = 2
if code < 0x80:
# Encode ASCII
- return chr(code)
+ return chr(code), 1
if code < 0x0800:
# Encode Latin-1
- return chr((0xc0 | (code >> 6))) + chr((0x80 | (code & 0x3f)))
+ return chr((0xc0 | (code >> 6))) + chr((0x80 | (code & 0x3f))), lgt
if code < 0x10000:
return (chr((0xe0 | (code >> 12))) +
chr((0x80 | ((code >> 6) & 0x3f))) +
- chr((0x80 | (code & 0x3f))))
+ chr((0x80 | (code & 0x3f)))), lgt
if code < 0x10ffff:
return (chr((0xf0 | (code >> 18))) +
chr((0x80 | ((code >> 12) & 0x3f))) +
chr((0x80 | ((code >> 6) & 0x3f))) +
- chr((0x80 | (code & 0x3f))))
+ chr((0x80 | (code & 0x3f)))), lgt
raise ValueError
class AsciiCheckError(Exception):
@@ -32,6 +38,151 @@
if ord(s[i]) & 0x80:
raise AsciiCheckError(i)
+def default_unicode_error_check(*args):
+ xxx
+
+# if you can't use the @elidable version, call str_check_utf8_impl()
+# directly
+@jit.elidable
+def str_check_utf8(s, size, errors, final=False,
+ errorhandler=None,
+ allow_surrogates=runicode.allow_surrogate_by_default):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_check
+ # XXX unclear, fix
+ # NB. a bit messy because rtyper/rstr.py also calls the same
+ # function. Make sure we annotate for the args it passes, too
+ if NonConstant(False):
+ s = NonConstant('?????')
+ size = NonConstant(12345)
+ errors = NonConstant('strict')
+ final = NonConstant(True)
+ WTF # goes here
+ errorhandler = ll_unicode_error_decode
+ allow_surrogates = NonConstant(True)
+ return str_check_utf8_elidable(s, size, errors, final, errorhandler,
+ allow_surrogates=allow_surrogates)
+
+def str_check_utf8_impl(s, size, errors, final, errorhandler,
+ allow_surrogates):
+ if size == 0:
+ return 0, 0
+
+ pos = 0
+ lgt = 0
+ while pos < size:
+ ordch1 = ord(s[pos])
+ # fast path for ASCII
+ # XXX maybe use a while loop here
+ if ordch1 < 0x80:
+ lgt += 1
+ pos += 1
+ continue
+
+ n = ord(runicode._utf8_code_length[ordch1 - 0x80])
+ if pos + n > size:
+ if not final:
+ break
+ # argh, this obscure block of code is mostly a copy of
+ # what follows :-(
+ charsleft = size - pos - 1 # either 0, 1, 2
+ # note: when we get the 'unexpected end of data' we need
+ # to care about the pos returned; it can be lower than size,
+ # in case we need to continue running this loop
+ if not charsleft:
+ # there's only the start byte and nothing else
+ errorhandler(errors, 'utf8', 'unexpected end of data',
+ s, pos, pos+1)
+ ordch2 = ord(s[pos+1])
+ if n == 3:
+ # 3-bytes seq with only a continuation byte
+ if runicode._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+ # second byte invalid, take the first and continue
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+1)
+ else:
+ # second byte valid, but third byte missing
+ errorhandler(errors, 'utf8', 'unexpected end of data',
+ s, pos, pos+2)
+ elif n == 4:
+ # 4-bytes seq with 1 or 2 continuation bytes
+ if runicode._invalid_byte_2_of_4(ordch1, ordch2):
+ # second byte invalid, take the first and continue
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+1)
+ elif charsleft == 2 and runicode._invalid_byte_3_of_4(ord(s[pos+2])):
+ # third byte invalid, take the first two and continue
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+2)
+ else:
+ # there's only 1 or 2 valid cb, but the others are missing
+ errorhandler(errors, 'utf8', 'unexpected end of data',
+ s, pos, pos+charsleft+1)
+ raise AssertionError("unreachable")
+
+ if n == 0:
+ errorhandler(errors, 'utf8', 'invalid start byte', s, pos, pos+1)
+ elif n == 1:
+ assert 0, "ascii should have gone through the fast path"
+
+ elif n == 2:
+ ordch2 = ord(s[pos+1])
+ if runicode._invalid_byte_2_of_2(ordch2):
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+2)
+ assert False, "unreachable"
+ # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+ lgt += 1
+ pos += 2
+
+ elif n == 3:
+ ordch2 = ord(s[pos+1])
+ ordch3 = ord(s[pos+2])
+ if runicode._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+1)
+ assert False, "unreachable"
+ elif runicode._invalid_byte_3_of_3(ordch3):
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+2)
+ assert False, "unreachable"
+ # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+ lgt += 1
+ pos += 3
+
+ elif n == 4:
+ ordch2 = ord(s[pos+1])
+ ordch3 = ord(s[pos+2])
+ ordch4 = ord(s[pos+3])
+ if runicode._invalid_byte_2_of_4(ordch1, ordch2):
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+1)
+ assert False, "unreachable"
+ elif runicode._invalid_byte_3_of_4(ordch3):
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+2)
+ assert False, "unreachable"
+ elif runicode._invalid_byte_4_of_4(ordch4):
+ errorhandler(errors, 'utf8', 'invalid continuation byte',
+ s, pos, pos+3)
+ assert False, "unreachable"
+ # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+ c = (((ordch1 & 0x07) << 18) + # 0b00000111
+ ((ordch2 & 0x3F) << 12) + # 0b00111111
+ ((ordch3 & 0x3F) << 6) + # 0b00111111
+ (ordch4 & 0x3F)) # 0b00111111
+ if c <= runicode.MAXUNICODE:
+ lgt += 1
+ else:
+ # append the two surrogates:
+ lgt += 2
+ pos += 4
+
+ return pos, lgt
+str_check_utf8_elidable = jit.elidable(
+ func_with_new_name(str_check_utf8_impl, "str_check_utf8_elidable"))
+
+
def str_decode_raw_utf8_escape(s, size, errors, final=False,
errorhandler=None):
lgt = 0
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -5,7 +5,9 @@
@given(strategies.integers(min_value=0, max_value=runicode.MAXUNICODE))
def test_unichr_as_utf8(i):
- assert rutf8.unichr_as_utf8(i) == runicode.UNICHR(i).encode('utf8')
+ u, lgt = rutf8.unichr_as_utf8(i)
+ r = runicode.UNICHR(i)
+ assert u == r.encode('utf8')
@given(strategies.binary())
def test_check_ascii(s):
@@ -22,6 +24,28 @@
else:
assert not raised
+def error_handler(errors, encoding, msg, char, start, end):
+ raise UnicodeDecodeError(encoding, char, start, end, msg)
+
+@given(strategies.binary())
+def test_str_check_utf8(s):
+ try:
+ u = s.decode("utf8")
+ valid = True
+ except UnicodeDecodeError as e:
+ valid = False
+ try:
+ consumed, length = rutf8.str_check_utf8(s, len(s), None,
+ errorhandler=error_handler, final=True)
+ except UnicodeDecodeError as a:
+ assert not valid
+ assert a.start == e.start
+ # assert a.end == e.end, ideally
+ else:
+ assert valid
+ assert consumed == len(s)
+ assert length == len(u)
+
@given(strategies.binary())
def test_str_decode_raw_utf8_escape(uni):
return # XXX fix details
More information about the pypy-commit
mailing list