[pypy-commit] pypy unicode-utf8: start testing using hypothesis

Tue Feb 21 09:13:00 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90267:422b66748b74
Date: 2017-02-21 15:13 +0100
http://bitbucket.org/pypy/pypy/changeset/422b66748b74/

Log:	start testing using hypothesis

diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -60,10 +60,10 @@
             xxx
             substr = decode_unicode_utf8(space, s, ps, q)
         if rawmode:
-            v = unicodehelper.decode_raw_unicode_escape(space, substr)
+            v, length = unicodehelper.decode_raw_unicode_escape(space, substr)
         else:
-            v = unicodehelper.decode_unicode_escape(space, substr)
-        return space.newunicode(v)
+            v, length = unicodehelper.decode_unicode_escape(space, substr)
+        return space.newunicode(v, length)
 
     need_encoding = (encoding is not None and
                      encoding != "utf-8" and encoding != "utf8" and
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -39,18 +39,18 @@
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
     # XXX pick better length, maybe
-    result, consumed = rutf8.str_decode_utf8_escape(
+    result, consumed, length = rutf8.str_decode_utf8_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
         unicodedata_handler=unicodedata_handler)
-    return result
+    return result, length
 
 def decode_raw_unicode_escape(space, string):
     # XXX pick better length, maybe
-    result, consumed = rutf8.str_decode_raw_utf8_escape(
+    result, consumed, length = rutf8.str_decode_raw_utf8_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space))
-    return result
+    return result, length
 
 def decode_utf8(space, string):
     # Surrogates are accepted and not treated specially at all.
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -398,7 +398,7 @@
 
 @marshaller(W_UnicodeObject)
 def marshal_unicode(space, w_unicode, m):
-    s = unicodehelper.encode_utf8(space, space.unicode_w(w_unicode))
+    s = space.utf8_w(w_unicode)
     m.atom_str(TYPE_UNICODE, s)
 
 @unmarshaller(TYPE_UNICODE)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -346,10 +346,10 @@
             return self.w_None
         return self.newtext(s)
 
-    def newunicode(self, utf8s):
+    def newunicode(self, utf8s, length):
         assert utf8s is not None
         assert isinstance(utf8s, str)
-        return W_UnicodeObject(utf8s)
+        return W_UnicodeObject(utf8s, length)
 
     def type(self, w_obj):
         jit.promote(w_obj.__class__)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -32,11 +32,12 @@
     _immutable_fields_ = ['_utf8']
 
     @enforceargs(utf8str=str)
-    def __init__(self, utf8str, ucs4str=None):
+    def __init__(self, utf8str, length, ucs4str=None):
         assert isinstance(utf8str, str)
         if ucs4str is not None:
             assert isinstance(ucs4str, unicode)
         self._utf8 = utf8str
+        self._length = length
         self._ucs4 = ucs4str
 
     def __repr__(self):
@@ -508,14 +509,13 @@
         if encoding == 'ascii':
             # XXX error handling
             s = space.charbuf_w(w_obj)
-            xxx
             try:
-                u = fast_str_decode_ascii(s)
-            except ValueError:
-                eh = unicodehelper.decode_error_handler(space)
-                u = str_decode_ascii(     # try again, to get the error right
-                    s, len(s), None, final=True, errorhandler=eh)[0]
-            return space.newunicode(u)
+                rutf8.check_ascii(s)
+            except rutf8.AsciiCheckError as e:
+                unicodehelper.decode_error_handler(space)(None,
+                    'ascii', "ordinal not in range(128)", s, e.pos, e.pos+1)
+                assert False
+            return space.newunicode(s)
         if encoding == 'utf-8':
             yyy
             s = space.charbuf_w(w_obj)
@@ -1130,7 +1130,7 @@
     return [s for s in value]
 
 
-W_UnicodeObject.EMPTY = W_UnicodeObject('')
+W_UnicodeObject.EMPTY = W_UnicodeObject('', 0)
 
 
 # Helper for converting int/long
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -24,19 +24,21 @@
     raise ValueError
 
 class AsciiCheckError(Exception):
-    pass
+    def __init__(self, pos):
+        self.pos = pos
 
 def check_ascii(s):
     for i in range(0, len(s)):
         if ord(s[i]) & 0x80:
-            raise AsciiCheckError
+            raise AsciiCheckError(i)
 
 def str_decode_raw_utf8_escape(s, size, errors, final=False,
                                errorhandler=None):
+    lgt = 0
     if errorhandler is None:
         errorhandler = None # default_unicode_error_decode
     if size == 0:
-        return '', 0
+        return '', 0, 0
     result = StringBuilder(size)
     pos = 0
     while pos < size:
@@ -46,6 +48,7 @@
         if ch != '\\':
             result.append(ch)
             pos += 1
+            lgt += 1
             continue
 
         # \u-escapes are only interpreted iff the number of leading
@@ -55,10 +58,12 @@
             pos += 1
             if pos == size or s[pos] != '\\':
                 break
+            lgt += 1
             result.append('\\')
 
         # we have a backslash at the end of the string, stop here
         if pos >= size:
+            lgt += 1
             result.append('\\')
             break
 
@@ -67,6 +72,7 @@
             (s[pos] != 'u' and s[pos] != 'U')):
             result.append('\\')
             result.append(s[pos])
+            lgt += 2
             pos += 1
             continue
 
@@ -77,7 +83,7 @@
         pos = hexescape(result, s, pos, digits,
                         "rawunicodeescape", errorhandler, message, errors)
 
-    return result.build(), pos
+    return result.build(), pos, lgt
 
 def str_decode_utf8_escape(s, size, errors, final=False,
                               errorhandler=None,
@@ -88,6 +94,7 @@
     if size == 0:
         return '', 0
 
+    lgt = 0
     builder = StringBuilder(size)
     pos = 0
     while pos < size:
@@ -97,6 +104,7 @@
         if ch != '\\':
             builder.append(ch)
             pos += 1
+            lgt += 1
             continue
 
         # - Escapes
@@ -106,22 +114,23 @@
             res, pos = errorhandler(errors, "unicodeescape",
                                     message, s, pos-1, size)
             builder.append(res)
+            lgt += 1
             continue
 
         ch = s[pos]
         pos += 1
         # \x escapes
         if ch == '\n': pass
-        elif ch == '\\': builder.append('\\')
-        elif ch == '\'': builder.append('\'')
-        elif ch == '\"': builder.append('\"')
-        elif ch == 'b' : builder.append('\b')
-        elif ch == 'f' : builder.append('\f')
-        elif ch == 't' : builder.append('\t')
-        elif ch == 'n' : builder.append('\n')
-        elif ch == 'r' : builder.append('\r')
-        elif ch == 'v' : builder.append('\v')
-        elif ch == 'a' : builder.append('\a')
+        elif ch == '\\': builder.append('\\'); lgt += 1
+        elif ch == '\'': builder.append('\''); lgt += 1
+        elif ch == '\"': builder.append('\"'); lgt += 1
+        elif ch == 'b' : builder.append('\b'); lgt += 1
+        elif ch == 'f' : builder.append('\f'); lgt += 1
+        elif ch == 't' : builder.append('\t'); lgt += 1
+        elif ch == 'n' : builder.append('\n'); lgt += 1
+        elif ch == 'r' : builder.append('\r'); lgt += 1
+        elif ch == 'v' : builder.append('\v'); lgt += 1
+        elif ch == 'a' : builder.append('\a'); lgt += 1
         elif '0' <= ch <= '7':
             xxx
             x = ord(ch) - ord('0')
@@ -199,5 +208,6 @@
         else:
             builder.append('\\')
             builder.append(ch)
+            lgt += 2
 
-    return builder.build(), pos
+    return builder.build(), pos, lgt
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/test/test_rutf8.py
@@ -0,0 +1,28 @@
+
+from hypothesis import given, strategies, settings
+
+from rpython.rlib import rutf8, runicode
+
+@given(strategies.integers(min_value=0, max_value=runicode.MAXUNICODE))
+def test_unichr_as_utf8(i):
+    assert rutf8.unichr_as_utf8(i) == runicode.UNICHR(i).encode('utf8')
+
+@given(strategies.binary())
+def test_check_ascii(s):
+    raised = False
+    try:
+        s.decode('ascii')
+    except UnicodeDecodeError as e:
+        raised = True
+    try:
+        rutf8.check_ascii(s)
+    except rutf8.AsciiCheckError as a:
+        assert raised
+        assert a.pos == e.start
+    else:
+        assert not raised
+
+@given(strategies.binary())
+def test_str_decode_raw_utf8_escape(uni):
+    return # XXX fix details
+    rutf8.str_decode_raw_utf8_escape(uni, len(uni), None)
\ No newline at end of file