[pypy-commit] pypy unicode-utf8-re: in-progress
arigo
pypy.commits at gmail.com
Fri Dec 8 05:46:49 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8-re
Changeset: r93303:0fd38947b59e
Date: 2017-12-08 11:45 +0100
http://bitbucket.org/pypy/pypy/changeset/0fd38947b59e/
Log: in-progress
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -6,9 +6,8 @@
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.rarithmetic import intmask
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import Utf8StringBuilder
# ____________________________________________________________
#
@@ -110,11 +109,15 @@
if endpos < pos:
endpos = pos
if space.isinstance_w(w_string, space.w_unicode):
- unicodestr = space.unicode_w(w_string)
- if pos > len(unicodestr):
- pos = len(unicodestr)
- if endpos > len(unicodestr):
- endpos = len(unicodestr)
+ utf8str, length = space.utf8_len_w(w_string)
+ if pos >= length:
+ bytepos = len(utf8str)
+ else:
+ bytepos = rutf8.codepoint_at_index(..)
+
+ pos = length
+ if endpos >= length:
+ endpos = length
return rsre_core.UnicodeMatchContext(self.code, unicodestr,
pos, endpos, self.flags)
elif space.isinstance_w(w_string, space.w_bytes):
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -87,6 +87,13 @@
assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+ def test_findall_unicode(self):
+ import re
+ assert [u"\u1234"] == re.findall(u"\u1234", u"\u1000\u1234\u2000")
+ assert ["a", "u"] == re.findall("b(.)", "abalbus")
+ assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
+ assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+
def test_finditer(self):
import re
it = re.finditer("b(.)", "brabbel")
More information about the pypy-commit mailing list