[pypy-commit] pypy unicode-utf8: Add an elidable surrogate_in_utf8() function

arigo pypy.commits at gmail.com
Thu Sep 21 03:01:09 EDT 2017


Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r92429:bf832917b82d
Date: 2017-09-21 08:56 +0200
http://bitbucket.org/pypy/pypy/changeset/bf832917b82d/

Log:	Add an elidable surrogate_in_utf8() function

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -347,6 +347,16 @@
     assert pos == len(s)
     return pos - continuation_bytes
 
+@jit.elidable
+def surrogate_in_utf8(value):
+    """Check if the UTF-8 byte string 'value' contains a surrogate.
+    The 'value' argument must be otherwise correctly formed for UTF-8.
+    """
+    for i in range(len(value) - 2):
+        if value[i] == '\xed' and value[i + 1] >= '\xa0':
+            return True
+    return False
+
 
 UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct(
     'utf8_loc',
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -98,3 +98,10 @@
     for i in range(len(u)):
         assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
                 len(u[:i].encode('utf8')))
+
+@given(strategies.lists(strategies.characters()))
+def test_surrogate_in_utf8(unichars):
+    uni = u''.join(unichars).encode('utf-8')
+    result = rutf8.surrogate_in_utf8(uni)
+    expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
+    assert result == expected


More information about the pypy-commit mailing list