[pypy-commit] pypy unicode-utf8: rutf8.codepoint_position_at_index() should also work for index == len(u)

Fri Sep 22 08:39:03 EDT 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r92435:0f1073a0843b
Date: 2017-09-22 14:37 +0200
http://bitbucket.org/pypy/pypy/changeset/0f1073a0843b/

Log:	rutf8.codepoint_position_at_index() should also work for index ==
	len(u)

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -95,6 +95,8 @@
     """
     pos = r_uint(pos)
     pos -= 1
+    if pos >= len(code):     # for the case where pos - 1 == len(code):
+        return pos           # assume there is an extra '\x00' character
     chr1 = ord(code[pos])
     if chr1 <= 0x7F:
         return pos
@@ -377,9 +379,9 @@
     """ Create an index storage which stores index of each 4th character
     in utf8 encoded unicode string.
     """
-    if len(utf8) == utf8len <= ASCII_INDEX_STORAGE_BLOCKS * 64:
+    if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64:
         return ASCII_INDEX_STORAGE
-    arraysize = (utf8len + 63) // 64
+    arraysize = utf8len // 64 + 1
     storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize)
     baseindex = 0
     current = 0
@@ -387,10 +389,14 @@
         storage[current].baseindex = baseindex
         next = baseindex
         for i in range(16):
-            next = next_codepoint_pos(utf8, next)
+            if utf8len == 0:
+                next += 1      # assume there is an extra '\x00' character
+            else:
+                next = next_codepoint_pos(utf8, next)
             storage[current].ofs[i] = chr(next - baseindex)
             utf8len -= 4
-            if utf8len <= 0:
+            if utf8len < 0:
+                assert current + 1 == len(storage)
                 break
             next = next_codepoint_pos(utf8, next)
             next = next_codepoint_pos(utf8, next)
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -93,9 +93,11 @@
                 ord(item))
 
 @given(strategies.text())
+@example(u'x' * 64 * 5)
+@example(u'x' * (64 * 5 - 1))
 def test_codepoint_position_at_index(u):
     index = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
-    for i in range(len(u)):
+    for i in range(len(u) + 1):
         assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
                 len(u[:i].encode('utf8')))