[pypy-commit] pypy unicode-utf8: rutf8.codepoint_position_at_index() should also work for index == len(u)
arigo
pypy.commits at gmail.com
Fri Sep 22 08:39:03 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r92435:0f1073a0843b
Date: 2017-09-22 14:37 +0200
http://bitbucket.org/pypy/pypy/changeset/0f1073a0843b/
Log: rutf8.codepoint_position_at_index() should also work for index ==
len(u)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -95,6 +95,8 @@
"""
pos = r_uint(pos)
pos -= 1
+ if pos >= len(code): # for the case where pos - 1 == len(code):
+ return pos # assume there is an extra '\x00' character
chr1 = ord(code[pos])
if chr1 <= 0x7F:
return pos
@@ -377,9 +379,9 @@
""" Create an index storage which stores index of each 4th character
in utf8 encoded unicode string.
"""
- if len(utf8) == utf8len <= ASCII_INDEX_STORAGE_BLOCKS * 64:
+ if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64:
return ASCII_INDEX_STORAGE
- arraysize = (utf8len + 63) // 64
+ arraysize = utf8len // 64 + 1
storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize)
baseindex = 0
current = 0
@@ -387,10 +389,14 @@
storage[current].baseindex = baseindex
next = baseindex
for i in range(16):
- next = next_codepoint_pos(utf8, next)
+ if utf8len == 0:
+ next += 1 # assume there is an extra '\x00' character
+ else:
+ next = next_codepoint_pos(utf8, next)
storage[current].ofs[i] = chr(next - baseindex)
utf8len -= 4
- if utf8len <= 0:
+ if utf8len < 0:
+ assert current + 1 == len(storage)
break
next = next_codepoint_pos(utf8, next)
next = next_codepoint_pos(utf8, next)
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -93,9 +93,11 @@
ord(item))
@given(strategies.text())
+@example(u'x' * 64 * 5)
+@example(u'x' * (64 * 5 - 1))
def test_codepoint_position_at_index(u):
index = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
- for i in range(len(u)):
+ for i in range(len(u) + 1):
assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
len(u[:i].encode('utf8')))
More information about the pypy-commit
mailing list