[pypy-commit] pypy unicode-utf8: write down check_utf8

Tue Feb 21 12:35:54 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90276:07c779b7a698
Date: 2017-02-21 18:35 +0100
http://bitbucket.org/pypy/pypy/changeset/07c779b7a698/

Log:	write down check_utf8

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -52,16 +52,15 @@
         final=True, errorhandler=decode_error_handler(space))
     return result, length
 
-def decode_utf8(space, string):
+def check_utf8(space, string):
     # Surrogates are accepted and not treated specially at all.
     # If there happen to be two 3-bytes encoding a pair of surrogates,
     # you still get two surrogate unicode characters in the result.
     # These are the Python2 rules; Python3 differs.
-    result, consumed = runicode.str_decode_utf_8(
-        string, len(string), "strict",
-        final=True, errorhandler=decode_error_handler(space),
+    consumed, length = rutf8.str_check_utf8(
+        string, "strict", final=True, errorhandler=decode_error_handler(space),
         allow_surrogates=True)
-    return result
+    return length
 
 def encode_utf8(space, uni):
     # Note that this function never raises UnicodeEncodeError,
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -27,10 +27,10 @@
     "Return a Unicode string of one character with the given ordinal."
     # XXX this assumes unichr would be happy to return you surrogates
     try:
-        s = unichr_as_utf8(code)
+        s, lgt = unichr_as_utf8(code)
     except ValueError:
         raise oefmt(space.w_ValueError, "unichr() arg out of range")
-    return space.newunicode(s)
+    return space.newunicode(s, lgt)
 
 def len(space, w_obj):
     "len(object) -> integer\n\nReturn the number of items of a sequence or mapping."
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -403,8 +403,9 @@
 
 @unmarshaller(TYPE_UNICODE)
 def unmarshal_unicode(space, u, tc):
-    return space.newunicode(unicodehelper.decode_utf8(space, u.get_str()))
-
+    arg = u.get_str()
+    length = unicodehelper.check_utf8(space, arg)
+    return space.newunicode(arg, length)
 
 @marshaller(W_SetObject)
 def marshal_set(space, w_set, m):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -583,7 +583,7 @@
     except rutf8.AsciiCheckError:
         # raising UnicodeDecodeError is messy, "please crash for me"
         return utf8_from_encoded_object(space, w_bytes, "ascii", "strict")
-    return W_UnicodeObject(s)
+    return W_UnicodeObject(s, len(s))
 
 
 class UnicodeDocstrings:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -1,26 +1,32 @@
 
 from rpython.rlib.rstring import StringBuilder
+from rpython.rlib import runicode, jit
+from rpython.rlib.nonconst import NonConstant
+from rpython.tool.sourcetools import func_with_new_name
 
 def unichr_as_utf8(code):
     """ Encode code (numeric value) as utf8 encoded string
     """
     if code < 0:
         raise ValueError
+    lgt = 1
+    if code >= runicode.MAXUNICODE:
+        lgt = 2
     if code < 0x80:
         # Encode ASCII
-        return chr(code)
+        return chr(code), 1
     if code < 0x0800:
         # Encode Latin-1
-        return chr((0xc0 | (code >> 6))) + chr((0x80 | (code & 0x3f)))
+        return chr((0xc0 | (code >> 6))) + chr((0x80 | (code & 0x3f))), lgt
     if code < 0x10000:
         return (chr((0xe0 | (code >> 12))) +
                 chr((0x80 | ((code >> 6) & 0x3f))) +
-                chr((0x80 | (code & 0x3f))))
+                chr((0x80 | (code & 0x3f)))), lgt
     if code < 0x10ffff:
         return (chr((0xf0 | (code >> 18))) +
                 chr((0x80 | ((code >> 12) & 0x3f))) +
                 chr((0x80 | ((code >> 6) & 0x3f))) +
-                chr((0x80 | (code & 0x3f))))
+                chr((0x80 | (code & 0x3f)))), lgt
     raise ValueError
 
 class AsciiCheckError(Exception):
@@ -32,6 +38,151 @@
         if ord(s[i]) & 0x80:
             raise AsciiCheckError(i)
 
+def default_unicode_error_check(*args):
+    xxx
+
+# if you can't use the @elidable version, call str_check_utf8_impl()
+# directly
+@jit.elidable
+def str_check_utf8(s, size, errors, final=False,
+                   errorhandler=None,
+                   allow_surrogates=runicode.allow_surrogate_by_default):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_check
+    # XXX unclear, fix
+    # NB. a bit messy because rtyper/rstr.py also calls the same
+    # function.  Make sure we annotate for the args it passes, too
+    if NonConstant(False):
+        s = NonConstant('?????')
+        size = NonConstant(12345)
+        errors = NonConstant('strict')
+        final = NonConstant(True)
+        WTF # goes here
+        errorhandler = ll_unicode_error_decode
+        allow_surrogates = NonConstant(True)
+    return str_check_utf8_elidable(s, size, errors, final, errorhandler,
+                                   allow_surrogates=allow_surrogates)
+
+def str_check_utf8_impl(s, size, errors, final, errorhandler,
+                        allow_surrogates):
+    if size == 0:
+        return 0, 0
+
+    pos = 0
+    lgt = 0
+    while pos < size:
+        ordch1 = ord(s[pos])
+        # fast path for ASCII
+        # XXX maybe use a while loop here
+        if ordch1 < 0x80:
+            lgt += 1
+            pos += 1
+            continue
+
+        n = ord(runicode._utf8_code_length[ordch1 - 0x80])
+        if pos + n > size:
+            if not final:
+                break
+            # argh, this obscure block of code is mostly a copy of
+            # what follows :-(
+            charsleft = size - pos - 1 # either 0, 1, 2
+            # note: when we get the 'unexpected end of data' we need
+            # to care about the pos returned; it can be lower than size,
+            # in case we need to continue running this loop
+            if not charsleft:
+                # there's only the start byte and nothing else
+                errorhandler(errors, 'utf8', 'unexpected end of data',
+                             s, pos, pos+1)
+            ordch2 = ord(s[pos+1])
+            if n == 3:
+                # 3-bytes seq with only a continuation byte
+                if runicode._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+                    # second byte invalid, take the first and continue
+                    errorhandler(errors, 'utf8', 'invalid continuation byte',
+                                 s, pos, pos+1)
+                else:
+                    # second byte valid, but third byte missing
+                    errorhandler(errors, 'utf8', 'unexpected end of data',
+                                 s, pos, pos+2)
+            elif n == 4:
+                # 4-bytes seq with 1 or 2 continuation bytes
+                if runicode._invalid_byte_2_of_4(ordch1, ordch2):
+                    # second byte invalid, take the first and continue
+                    errorhandler(errors, 'utf8', 'invalid continuation byte',
+                                 s, pos, pos+1)
+                elif charsleft == 2 and runicode._invalid_byte_3_of_4(ord(s[pos+2])):
+                    # third byte invalid, take the first two and continue
+                    errorhandler(errors, 'utf8', 'invalid continuation byte',
+                                 s, pos, pos+2)
+                else:
+                    # there's only 1 or 2 valid cb, but the others are missing
+                    errorhandler(errors, 'utf8', 'unexpected end of data',
+                                 s, pos, pos+charsleft+1)
+            raise AssertionError("unreachable")
+
+        if n == 0:
+            errorhandler(errors, 'utf8', 'invalid start byte', s, pos, pos+1)
+        elif n == 1:
+            assert 0, "ascii should have gone through the fast path"
+
+        elif n == 2:
+            ordch2 = ord(s[pos+1])
+            if runicode._invalid_byte_2_of_2(ordch2):
+                errorhandler(errors, 'utf8', 'invalid continuation byte',
+                             s, pos, pos+2)
+                assert False, "unreachable"
+            # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+            lgt += 1
+            pos += 2
+
+        elif n == 3:
+            ordch2 = ord(s[pos+1])
+            ordch3 = ord(s[pos+2])
+            if runicode._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+                errorhandler(errors, 'utf8', 'invalid continuation byte',
+                             s, pos, pos+1)
+                assert False, "unreachable"
+            elif runicode._invalid_byte_3_of_3(ordch3):
+                errorhandler(errors, 'utf8', 'invalid continuation byte',
+                             s, pos, pos+2)
+                assert False, "unreachable"
+            # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+            lgt += 1
+            pos += 3
+
+        elif n == 4:
+            ordch2 = ord(s[pos+1])
+            ordch3 = ord(s[pos+2])
+            ordch4 = ord(s[pos+3])
+            if runicode._invalid_byte_2_of_4(ordch1, ordch2):
+                errorhandler(errors, 'utf8', 'invalid continuation byte',
+                             s, pos, pos+1)
+                assert False, "unreachable"
+            elif runicode._invalid_byte_3_of_4(ordch3):
+                errorhandler(errors, 'utf8', 'invalid continuation byte',
+                             s, pos, pos+2)
+                assert False, "unreachable"
+            elif runicode._invalid_byte_4_of_4(ordch4):
+                errorhandler(errors, 'utf8', 'invalid continuation byte',
+                             s, pos, pos+3)
+                assert False, "unreachable"
+            # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+            c = (((ordch1 & 0x07) << 18) +      # 0b00000111
+                 ((ordch2 & 0x3F) << 12) +      # 0b00111111
+                 ((ordch3 & 0x3F) << 6) +       # 0b00111111
+                 (ordch4 & 0x3F))               # 0b00111111
+            if c <= runicode.MAXUNICODE:
+                lgt += 1
+            else:
+                # append the two surrogates:
+                lgt += 2
+            pos += 4
+
+    return pos, lgt
+str_check_utf8_elidable = jit.elidable(
+    func_with_new_name(str_check_utf8_impl, "str_check_utf8_elidable"))
+
+
 def str_decode_raw_utf8_escape(s, size, errors, final=False,
                                errorhandler=None):
     lgt = 0
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -5,7 +5,9 @@
 
 @given(strategies.integers(min_value=0, max_value=runicode.MAXUNICODE))
 def test_unichr_as_utf8(i):
-    assert rutf8.unichr_as_utf8(i) == runicode.UNICHR(i).encode('utf8')
+    u, lgt = rutf8.unichr_as_utf8(i)
+    r = runicode.UNICHR(i)
+    assert u == r.encode('utf8')
 
 @given(strategies.binary())
 def test_check_ascii(s):
@@ -22,6 +24,28 @@
     else:
         assert not raised
 
+def error_handler(errors, encoding, msg, char, start, end):
+    raise UnicodeDecodeError(encoding, char, start, end, msg)
+
+@given(strategies.binary())
+def test_str_check_utf8(s):
+    try:
+        u = s.decode("utf8")
+        valid = True
+    except UnicodeDecodeError as e:
+        valid = False
+    try:
+        consumed, length = rutf8.str_check_utf8(s, len(s), None,
+            errorhandler=error_handler, final=True)
+    except UnicodeDecodeError as a:
+        assert not valid
+        assert a.start == e.start
+        # assert a.end == e.end, ideally
+    else:
+        assert valid
+        assert consumed == len(s)
+        assert length == len(u)
+
 @given(strategies.binary())
 def test_str_decode_raw_utf8_escape(uni):
     return # XXX fix details