[pypy-commit] pypy unicode-utf8: implement next_codepoint_pos
fijal
pypy.commits at gmail.com
Thu Feb 23 12:49:15 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90328:9bba28798a49
Date: 2017-02-23 18:48 +0100
http://bitbucket.org/pypy/pypy/changeset/9bba28798a49/
Log: implement next_codepoint_pos
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -1032,6 +1032,7 @@
unilist = space.listview_unicode(w_iterable)
if unilist is not None:
+ xxx
w_list.strategy = strategy = space.fromcache(UnicodeListStrategy)
# need to copy because intlist can share with w_iterable
w_list.lstorage = strategy.erase(unilist[:])
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -299,6 +299,7 @@
newlist_text = newlist_bytes
def newlist_unicode(self, list_u):
+ xxx
return self.newlist(list_u)
return W_ListObject.newlist_unicode(self, list_u)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -471,6 +471,29 @@
sb.append(unwrapped[i])
return self._new(sb.build(), lgt)
+ @unwrap_spec(keepends=bool)
+ def descr_splitlines(self, space, keepends=False):
+ value = self._val(space)
+ length = len(value)
+ strs = []
+ pos = 0
+ while pos < length:
+ sol = pos
+ while pos < length and not self._islinebreak(value[pos]):
+ pos += 1
+ eol = pos
+ pos += 1
+ # read CRLF as one line break
+ if pos < length and value[eol] == '\r' and value[pos] == '\n':
+ pos += 1
+ if keepends:
+ eol = pos
+ strs.append(value[sol:eol])
+ if pos < length:
+ strs.append(value[pos:length])
+ return self._newlist_unwrapped(space, strs)
+
+
def wrapunicode(space, uni):
return W_UnicodeObject(uni)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -29,6 +29,15 @@
chr((0x80 | (code & 0x3f)))), lgt
raise ValueError
+def next_codepoint_pos(code, pos):
+ """ Gives the position of the next codepoint after pos, -1
+ if it's the last one (assumes valid utf8)
+ """
+ chr1 = ord(code[pos])
+ if chr1 < 0x80:
+ return pos + 1
+ return pos + ord(runicode._utf8_code_length[chr1 - 0x80])
+
class AsciiCheckError(Exception):
def __init__(self, pos):
self.pos = pos
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -1,5 +1,5 @@
-from hypothesis import given, strategies, settings
+from hypothesis import given, strategies, settings, example
from rpython.rlib import rutf8, runicode
@@ -30,7 +30,7 @@
@given(strategies.binary())
def test_str_check_utf8(s):
try:
- u = s.decode("utf8")
+ u, _ = runicode.str_decode_utf_8(s, len(s), None, final=True)
valid = True
except UnicodeDecodeError as e:
valid = False
@@ -49,4 +49,18 @@
@given(strategies.binary())
def test_str_decode_raw_utf8_escape(uni):
return # XXX fix details
- rutf8.str_decode_raw_utf8_escape(uni, len(uni), None)
\ No newline at end of file
+ rutf8.str_decode_raw_utf8_escape(uni, len(uni), None)
+
+@given(strategies.characters())
+def test_next_pos(uni):
+ skips = []
+ for elem in uni:
+ skips.append(len(elem.encode('utf8')))
+ pos = 0
+ i = 0
+ utf8 = uni.encode('utf8')
+ while pos < len(utf8):
+ new_pos = rutf8.next_codepoint_pos(utf8, pos)
+ assert new_pos - pos == skips[i]
+ i += 1
+ pos = new_pos
More information about the pypy-commit
mailing list