[pypy-commit] pypy unicode-utf8: give up and write the regex by hand, check in a test
fijal
pypy.commits at gmail.com
Thu Feb 23 13:39:24 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90330:62c7e93c717d
Date: 2017-02-23 19:32 +0100
http://bitbucket.org/pypy/pypy/changeset/62c7e93c717d/
Log: give up and write the regex by hand, check in a test
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -479,8 +479,8 @@
pos = 0
while pos < length:
sol = pos
- while pos < length and not self._islinebreak(value[pos]):
- pos += 1
+ while pos < length and not self._islinebreak(value, pos):
+ pos = rutf8.next_codepoint_pos(value, pos)
eol = pos
pos += 1
# read CRLF as one line break
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -50,6 +50,23 @@
def default_unicode_error_check(*args):
xxx
+def check_newline_utf8(s, pos):
+ chr1 = ord(s[pos])
+ if 0xa <= chr1 <= 0xd:
+ return True
+ if 0x1c <= chr1 <= 0x1e:
+ return True
+ if chr1 == 0xc2:
+ chr2 = ord(s[pos + 1])
+ return chr2 == 0x85
+ elif chr1 == 0xe2:
+ chr2 = ord(s[pos + 1])
+ if chr2 != 0x80:
+ return False
+ chr3 = ord(s[pos + 2])
+ return chr3 == 0xa8 or chr3 == 0xa9
+ return False
+
# if you can't use the @elidable version, call str_check_utf8_impl()
# directly
@jit.elidable
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -1,4 +1,5 @@
+import sys
from hypothesis import given, strategies, settings, example
from rpython.rlib import rutf8, runicode
@@ -64,3 +65,10 @@
assert new_pos - pos == skips[i]
i += 1
pos = new_pos
+
+def test_check_newline_utf8():
+ for i in xrange(sys.maxunicode):
+ if runicode.unicodedb.islinebreak(i):
+ assert rutf8.check_newline_utf8(unichr(i).encode('utf8'), 0)
+ else:
+ assert not rutf8.check_newline_utf8(unichr(i).encode('utf8'), 0)
More information about the pypy-commit mailing list