[pypy-commit] pypy unicode-utf8-re: in-progress

arigo pypy.commits at gmail.com
Fri Dec 8 05:46:49 EST 2017


Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8-re
Changeset: r93303:0fd38947b59e
Date: 2017-12-08 11:45 +0100
http://bitbucket.org/pypy/pypy/changeset/0fd38947b59e/

Log:	in-progress

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -6,9 +6,8 @@
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.rarithmetic import intmask
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import Utf8StringBuilder
 
 # ____________________________________________________________
 #
@@ -110,11 +109,15 @@
         if endpos < pos:
             endpos = pos
         if space.isinstance_w(w_string, space.w_unicode):
-            unicodestr = space.unicode_w(w_string)
-            if pos > len(unicodestr):
-                pos = len(unicodestr)
-            if endpos > len(unicodestr):
-                endpos = len(unicodestr)
+            utf8str, length = space.utf8_len_w(w_string)
+            if pos >= length:
+                bytepos = len(utf8str)
+            else:
+                bytepos = rutf8.codepoint_at_index(..)
+
+                pos = length
+            if endpos >= length:
+                endpos = length
             return rsre_core.UnicodeMatchContext(self.code, unicodestr,
                                                  pos, endpos, self.flags)
         elif space.isinstance_w(w_string, space.w_bytes):
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -87,6 +87,13 @@
         assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
         assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
 
+    def test_findall_unicode(self):
+        import re
+        assert [u"\u1234"] == re.findall(u"\u1234", u"\u1000\u1234\u2000")
+        assert ["a", "u"] == re.findall("b(.)", "abalbus")
+        assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
+        assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+
     def test_finditer(self):
         import re
         it = re.finditer("b(.)", "brabbel")


More information about the pypy-commit mailing list