[pypy-commit] pypy unicode-utf8: start testing using hypothesis
fijal
pypy.commits at gmail.com
Tue Feb 21 09:13:00 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90267:422b66748b74
Date: 2017-02-21 15:13 +0100
http://bitbucket.org/pypy/pypy/changeset/422b66748b74/
Log: start testing using hypothesis
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -60,10 +60,10 @@
xxx
substr = decode_unicode_utf8(space, s, ps, q)
if rawmode:
- v = unicodehelper.decode_raw_unicode_escape(space, substr)
+ v, length = unicodehelper.decode_raw_unicode_escape(space, substr)
else:
- v = unicodehelper.decode_unicode_escape(space, substr)
- return space.newunicode(v)
+ v, length = unicodehelper.decode_unicode_escape(space, substr)
+ return space.newunicode(v, length)
need_encoding = (encoding is not None and
encoding != "utf-8" and encoding != "utf8" and
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -39,18 +39,18 @@
state = space.fromcache(interp_codecs.CodecState)
unicodedata_handler = state.get_unicodedata_handler(space)
# XXX pick better length, maybe
- result, consumed = rutf8.str_decode_utf8_escape(
+ result, consumed, length = rutf8.str_decode_utf8_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space),
unicodedata_handler=unicodedata_handler)
- return result
+ return result, length
def decode_raw_unicode_escape(space, string):
# XXX pick better length, maybe
- result, consumed = rutf8.str_decode_raw_utf8_escape(
+ result, consumed, length = rutf8.str_decode_raw_utf8_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space))
- return result
+ return result, length
def decode_utf8(space, string):
# Surrogates are accepted and not treated specially at all.
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -398,7 +398,7 @@
@marshaller(W_UnicodeObject)
def marshal_unicode(space, w_unicode, m):
- s = unicodehelper.encode_utf8(space, space.unicode_w(w_unicode))
+ s = space.utf8_w(w_unicode)
m.atom_str(TYPE_UNICODE, s)
@unmarshaller(TYPE_UNICODE)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -346,10 +346,10 @@
return self.w_None
return self.newtext(s)
- def newunicode(self, utf8s):
+ def newunicode(self, utf8s, length):
assert utf8s is not None
assert isinstance(utf8s, str)
- return W_UnicodeObject(utf8s)
+ return W_UnicodeObject(utf8s, length)
def type(self, w_obj):
jit.promote(w_obj.__class__)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -32,11 +32,12 @@
_immutable_fields_ = ['_utf8']
@enforceargs(utf8str=str)
- def __init__(self, utf8str, ucs4str=None):
+ def __init__(self, utf8str, length, ucs4str=None):
assert isinstance(utf8str, str)
if ucs4str is not None:
assert isinstance(ucs4str, unicode)
self._utf8 = utf8str
+ self._length = length
self._ucs4 = ucs4str
def __repr__(self):
@@ -508,14 +509,13 @@
if encoding == 'ascii':
# XXX error handling
s = space.charbuf_w(w_obj)
- xxx
try:
- u = fast_str_decode_ascii(s)
- except ValueError:
- eh = unicodehelper.decode_error_handler(space)
- u = str_decode_ascii( # try again, to get the error right
- s, len(s), None, final=True, errorhandler=eh)[0]
- return space.newunicode(u)
+ rutf8.check_ascii(s)
+ except rutf8.AsciiCheckError as e:
+ unicodehelper.decode_error_handler(space)(None,
+ 'ascii', "ordinal not in range(128)", s, e.pos, e.pos+1)
+ assert False
+ return space.newunicode(s)
if encoding == 'utf-8':
yyy
s = space.charbuf_w(w_obj)
@@ -1130,7 +1130,7 @@
return [s for s in value]
-W_UnicodeObject.EMPTY = W_UnicodeObject('')
+W_UnicodeObject.EMPTY = W_UnicodeObject('', 0)
# Helper for converting int/long
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -24,19 +24,21 @@
raise ValueError
class AsciiCheckError(Exception):
- pass
+ def __init__(self, pos):
+ self.pos = pos
def check_ascii(s):
for i in range(0, len(s)):
if ord(s[i]) & 0x80:
- raise AsciiCheckError
+ raise AsciiCheckError(i)
def str_decode_raw_utf8_escape(s, size, errors, final=False,
errorhandler=None):
+ lgt = 0
if errorhandler is None:
errorhandler = None # default_unicode_error_decode
if size == 0:
- return '', 0
+ return '', 0, 0
result = StringBuilder(size)
pos = 0
while pos < size:
@@ -46,6 +48,7 @@
if ch != '\\':
result.append(ch)
pos += 1
+ lgt += 1
continue
# \u-escapes are only interpreted iff the number of leading
@@ -55,10 +58,12 @@
pos += 1
if pos == size or s[pos] != '\\':
break
+ lgt += 1
result.append('\\')
# we have a backslash at the end of the string, stop here
if pos >= size:
+ lgt += 1
result.append('\\')
break
@@ -67,6 +72,7 @@
(s[pos] != 'u' and s[pos] != 'U')):
result.append('\\')
result.append(s[pos])
+ lgt += 2
pos += 1
continue
@@ -77,7 +83,7 @@
pos = hexescape(result, s, pos, digits,
"rawunicodeescape", errorhandler, message, errors)
- return result.build(), pos
+ return result.build(), pos, lgt
def str_decode_utf8_escape(s, size, errors, final=False,
errorhandler=None,
@@ -88,6 +94,7 @@
if size == 0:
return '', 0
+ lgt = 0
builder = StringBuilder(size)
pos = 0
while pos < size:
@@ -97,6 +104,7 @@
if ch != '\\':
builder.append(ch)
pos += 1
+ lgt += 1
continue
# - Escapes
@@ -106,22 +114,23 @@
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, size)
builder.append(res)
+ lgt += 1
continue
ch = s[pos]
pos += 1
# \x escapes
if ch == '\n': pass
- elif ch == '\\': builder.append('\\')
- elif ch == '\'': builder.append('\'')
- elif ch == '\"': builder.append('\"')
- elif ch == 'b' : builder.append('\b')
- elif ch == 'f' : builder.append('\f')
- elif ch == 't' : builder.append('\t')
- elif ch == 'n' : builder.append('\n')
- elif ch == 'r' : builder.append('\r')
- elif ch == 'v' : builder.append('\v')
- elif ch == 'a' : builder.append('\a')
+ elif ch == '\\': builder.append('\\'); lgt += 1
+ elif ch == '\'': builder.append('\''); lgt += 1
+ elif ch == '\"': builder.append('\"'); lgt += 1
+ elif ch == 'b' : builder.append('\b'); lgt += 1
+ elif ch == 'f' : builder.append('\f'); lgt += 1
+ elif ch == 't' : builder.append('\t'); lgt += 1
+ elif ch == 'n' : builder.append('\n'); lgt += 1
+ elif ch == 'r' : builder.append('\r'); lgt += 1
+ elif ch == 'v' : builder.append('\v'); lgt += 1
+ elif ch == 'a' : builder.append('\a'); lgt += 1
elif '0' <= ch <= '7':
xxx
x = ord(ch) - ord('0')
@@ -199,5 +208,6 @@
else:
builder.append('\\')
builder.append(ch)
+ lgt += 2
- return builder.build(), pos
+ return builder.build(), pos, lgt
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/test/test_rutf8.py
@@ -0,0 +1,28 @@
+
+from hypothesis import given, strategies, settings
+
+from rpython.rlib import rutf8, runicode
+
+@given(strategies.integers(min_value=0, max_value=runicode.MAXUNICODE))
+def test_unichr_as_utf8(i):
+ assert rutf8.unichr_as_utf8(i) == runicode.UNICHR(i).encode('utf8')
+
+@given(strategies.binary())
+def test_check_ascii(s):
+ raised = False
+ try:
+ s.decode('ascii')
+ except UnicodeDecodeError as e:
+ raised = True
+ try:
+ rutf8.check_ascii(s)
+ except rutf8.AsciiCheckError as a:
+ assert raised
+ assert a.pos == e.start
+ else:
+ assert not raised
+
+@given(strategies.binary())
+def test_str_decode_raw_utf8_escape(uni):
+ return # XXX fix details
+ rutf8.str_decode_raw_utf8_escape(uni, len(uni), None)
\ No newline at end of file
More information about the pypy-commit
mailing list