[pypy-commit] pypy unicode-utf8: implement next_codepoint_pos

Thu Feb 23 12:49:15 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90328:9bba28798a49
Date: 2017-02-23 18:48 +0100
http://bitbucket.org/pypy/pypy/changeset/9bba28798a49/

Log:	implement next_codepoint_pos

diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -1032,6 +1032,7 @@
 
         unilist = space.listview_unicode(w_iterable)
         if unilist is not None:
+            xxx
             w_list.strategy = strategy = space.fromcache(UnicodeListStrategy)
             # need to copy because intlist can share with w_iterable
             w_list.lstorage = strategy.erase(unilist[:])
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -299,6 +299,7 @@
     newlist_text = newlist_bytes
 
     def newlist_unicode(self, list_u):
+        xxx
         return self.newlist(list_u)
         return W_ListObject.newlist_unicode(self, list_u)
 
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -471,6 +471,29 @@
             sb.append(unwrapped[i])
         return self._new(sb.build(), lgt)
 
+    @unwrap_spec(keepends=bool)
+    def descr_splitlines(self, space, keepends=False):
+        value = self._val(space)
+        length = len(value)
+        strs = []
+        pos = 0
+        while pos < length:
+            sol = pos
+            while pos < length and not self._islinebreak(value[pos]):
+                pos += 1
+            eol = pos
+            pos += 1
+            # read CRLF as one line break
+            if pos < length and value[eol] == '\r' and value[pos] == '\n':
+                pos += 1
+            if keepends:
+                eol = pos
+            strs.append(value[sol:eol])
+        if pos < length:
+            strs.append(value[pos:length])
+        return self._newlist_unwrapped(space, strs)
+
+
 def wrapunicode(space, uni):
     return W_UnicodeObject(uni)
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -29,6 +29,15 @@
                 chr((0x80 | (code & 0x3f)))), lgt
     raise ValueError
 
+def next_codepoint_pos(code, pos):
+    """ Gives the position of the next codepoint after pos; for the
+    last one this is len(code), not -1 (assumes valid utf8)
+    """
+    chr1 = ord(code[pos])
+    if chr1 < 0x80:
+        return pos + 1
+    return pos + ord(runicode._utf8_code_length[chr1 - 0x80])
+
 class AsciiCheckError(Exception):
     def __init__(self, pos):
         self.pos = pos
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -1,5 +1,5 @@
 
-from hypothesis import given, strategies, settings
+from hypothesis import given, strategies, settings, example
 
 from rpython.rlib import rutf8, runicode
 
@@ -30,7 +30,7 @@
 @given(strategies.binary())
 def test_str_check_utf8(s):
     try:
-        u = s.decode("utf8")
+        u, _ = runicode.str_decode_utf_8(s, len(s), None, final=True)
         valid = True
     except UnicodeDecodeError as e:
         valid = False
@@ -49,4 +49,18 @@
 @given(strategies.binary())
 def test_str_decode_raw_utf8_escape(uni):
     return # XXX fix details
-    rutf8.str_decode_raw_utf8_escape(uni, len(uni), None)
\ No newline at end of file
+    rutf8.str_decode_raw_utf8_escape(uni, len(uni), None)
+
+@given(strategies.characters())
+def test_next_pos(uni):
+    skips = []
+    for elem in uni:
+        skips.append(len(elem.encode('utf8')))
+    pos = 0
+    i = 0
+    utf8 = uni.encode('utf8')
+    while pos < len(utf8):
+        new_pos = rutf8.next_codepoint_pos(utf8, pos)
+        assert new_pos - pos == skips[i]
+        i += 1
+        pos = new_pos