[pypy-commit] pypy unicode-utf8: give up and write the regex by hand, check in a test

fijal pypy.commits at gmail.com
Thu Feb 23 13:39:24 EST 2017


Author: fijal
Branch: unicode-utf8
Changeset: r90330:62c7e93c717d
Date: 2017-02-23 19:32 +0100
http://bitbucket.org/pypy/pypy/changeset/62c7e93c717d/

Log:	give up and write the regex by hand, check in a test

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -479,8 +479,8 @@
         pos = 0
         while pos < length:
             sol = pos
-            while pos < length and not self._islinebreak(value[pos]):
-                pos += 1
+            while pos < length and not self._islinebreak(value, pos):
+                pos = rutf8.next_codepoint_pos(value, pos)
             eol = pos
             pos += 1
             # read CRLF as one line break
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -50,6 +50,23 @@
 def default_unicode_error_check(*args):
     xxx
 
+def check_newline_utf8(s, pos):  # True if the byte sequence starting at s[pos] matches a line-break pattern below
+    chr1 = ord(s[pos])  # first byte of the sequence at pos
+    if 0xa <= chr1 <= 0xd:  # single byte 0x0A-0x0D: LF, VT, FF, CR
+        return True
+    if 0x1c <= chr1 <= 0x1e:  # single byte 0x1C-0x1E: FS, GS, RS
+        return True
+    if chr1 == 0xc2:  # lead byte of a two-byte sequence; only C2 85 (U+0085, NEL) is accepted
+        chr2 = ord(s[pos + 1])  # NOTE(review): no bounds check; presumably s is well-formed UTF-8 - confirm with callers
+        return chr2 == 0x85
+    elif chr1 == 0xe2:  # lead byte of a three-byte sequence; only E2 80 A8 / E2 80 A9 are accepted
+        chr2 = ord(s[pos + 1])  # NOTE(review): unchecked read, same assumption as above
+        if chr2 != 0x80:
+            return False
+        chr3 = ord(s[pos + 2])  # NOTE(review): unchecked read, same assumption as above
+        return chr3 == 0xa8 or chr3 == 0xa9  # U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR
+    return False  # any other first byte is not treated as a line break
+
 # if you can't use the @elidable version, call str_check_utf8_impl()
 # directly
 @jit.elidable
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -1,4 +1,5 @@
 
+import sys
 from hypothesis import given, strategies, settings, example
 
 from rpython.rlib import rutf8, runicode
@@ -64,3 +65,10 @@
         assert new_pos - pos == skips[i]
         i += 1
         pos = new_pos
+
+def test_check_newline_utf8():  # cross-checks rutf8.check_newline_utf8 against runicode.unicodedb.islinebreak
+    for i in xrange(sys.maxunicode):  # NOTE(review): xrange stops at sys.maxunicode - 1, so the last code point is not tested
+        if runicode.unicodedb.islinebreak(i):  # unicodedb's answer is the expected value
+            assert rutf8.check_newline_utf8(unichr(i).encode('utf8'), 0)  # encoded form must be detected at position 0
+        else:
+            assert not rutf8.check_newline_utf8(unichr(i).encode('utf8'), 0)  # non-linebreaks must be rejected


More information about the pypy-commit mailing list