[pypy-commit] pypy unicode-utf8: Add an elidable surrogate_in_utf8() function
arigo
pypy.commits at gmail.com
Thu Sep 21 03:01:09 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r92429:bf832917b82d
Date: 2017-09-21 08:56 +0200
http://bitbucket.org/pypy/pypy/changeset/bf832917b82d/
Log: Add an elidable surrogate_in_utf8() function
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -347,6 +347,16 @@
assert pos == len(s)
return pos - continuation_bytes
+@jit.elidable
+def surrogate_in_utf8(value):
+    """Check if the UTF-8 byte string 'value' contains a surrogate.
+    The 'value' argument must be otherwise correctly formed for UTF-8.
+    """
+    for i in range(len(value) - 2):
+        if value[i] == '\xed' and value[i + 1] >= '\xa0':  # ED A0..BF xx
+            return True
+    return False
+
UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct(
'utf8_loc',
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -98,3 +98,10 @@
for i in range(len(u)):
assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
len(u[:i].encode('utf8')))
+
+@given(strategies.lists(strategies.characters()))
+def test_surrogate_in_utf8(unichars):
+    uni = u''.join(unichars).encode('utf-8')  # UTF-8 bytes, not unicode
+    result = rutf8.surrogate_in_utf8(uni)
+    expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
+    assert result == expected
More information about the pypy-commit
mailing list