[pypy-svn] r15933 - in pypy/dist/pypy/module/_sre: . test

Wed Aug 10 19:17:31 CEST 2005

Author: nik
Date: Wed Aug 10 19:17:29 2005
New Revision: 15933

Modified:
   pypy/dist/pypy/module/_sre/__init__.py
   pypy/dist/pypy/module/_sre/app_sre.py
   pypy/dist/pypy/module/_sre/interp_sre.py
   pypy/dist/pypy/module/_sre/test/test_app_sre.py
Log:
moved more functions from app- to interp-level. added interp-level tests.


Modified: pypy/dist/pypy/module/_sre/__init__.py
==============================================================================

--- pypy/dist/pypy/module/_sre/__init__.py	(original)
+++ pypy/dist/pypy/module/_sre/__init__.py	Wed Aug 10 19:17:29 2005
@@ -21,5 +21,8 @@
         '_is_digit':      'interp_sre._is_digit',
         '_is_space':      'interp_sre._is_space',
         '_is_word':       'interp_sre._is_word',
+        '_is_uni_word':   'interp_sre._is_uni_word',
+        '_is_loc_word':   'interp_sre._is_loc_word',
         '_is_linebreak':  'interp_sre._is_linebreak',
+        '_is_uni_linebreak': 'interp_sre._is_uni_linebreak',
     }

Modified: pypy/dist/pypy/module/_sre/app_sre.py
==============================================================================
--- pypy/dist/pypy/module/_sre/app_sre.py	(original)
+++ pypy/dist/pypy/module/_sre/app_sre.py	Wed Aug 10 19:17:29 2005
@@ -1170,13 +1170,13 @@
     def at_non_boundary(self, ctx):
         return not ctx.at_boundary(_sre._is_word)
     def at_loc_boundary(self, ctx):
-        return ctx.at_boundary(_is_loc_word)
+        return ctx.at_boundary(_sre._is_loc_word)
     def at_loc_non_boundary(self, ctx):
-        return not ctx.at_boundary(_is_loc_word)
+        return not ctx.at_boundary(_sre._is_loc_word)
     def at_uni_boundary(self, ctx):
-        return ctx.at_boundary(_is_uni_word)
+        return ctx.at_boundary(_sre._is_uni_word)
     def at_uni_non_boundary(self, ctx):
-        return not ctx.at_boundary(_is_uni_word)
+        return not ctx.at_boundary(_sre._is_uni_word)
     def unknown(self, ctx):
         return False
 
@@ -1202,9 +1202,9 @@
     def category_not_linebreak(self, ctx):
         return not _sre._is_linebreak(ctx.peek_char())
     def category_loc_word(self, ctx):
-        return _is_loc_word(ctx.peek_char())
+        return _sre._is_loc_word(ctx.peek_char())
     def category_loc_not_word(self, ctx):
-        return not _is_loc_word(ctx.peek_char())
+        return not _sre._is_loc_word(ctx.peek_char())
     def category_uni_digit(self, ctx):
         return ctx.peek_char().isdigit()
     def category_uni_not_digit(self, ctx):
@@ -1214,28 +1214,19 @@
     def category_uni_not_space(self, ctx):
         return not ctx.peek_char().isspace()
     def category_uni_word(self, ctx):
-        return _is_uni_word(ctx.peek_char())
+        return _sre._is_uni_word(ctx.peek_char())
     def category_uni_not_word(self, ctx):
-        return not _is_uni_word(ctx.peek_char())
+        return not _sre._is_uni_word(ctx.peek_char())
     def category_uni_linebreak(self, ctx):
-        return ord(ctx.peek_char()) in _uni_linebreaks
+        return _sre._is_uni_linebreak(ctx.peek_char())
     def category_uni_not_linebreak(self, ctx):
-        return ord(ctx.peek_char()) not in _uni_linebreaks
+        return not _sre._is_uni_linebreak(ctx.peek_char())
     def unknown(self, ctx):
         return False
 
 _ChcodeDispatcher.build_dispatch_table(CHCODES, "")
 
 
-def _is_loc_word(char):
-    return (not (ord(char) & ~255) and char.isalnum()) or char == '_'
-
-def _is_uni_word(char):
-    return char.isalnum() or char == '_'
-
-# Static list of all unicode codepoints reported by Py_UNICODE_ISLINEBREAK.
-_uni_linebreaks = [10, 13, 28, 29, 30, 133, 8232, 8233]
-
 def _log(message):
     if 0:
         print message

Modified: pypy/dist/pypy/module/_sre/interp_sre.py
==============================================================================
--- pypy/dist/pypy/module/_sre/interp_sre.py	(original)
+++ pypy/dist/pypy/module/_sre/interp_sre.py	Wed Aug 10 19:17:29 2005
@@ -1,6 +1,6 @@
 from pypy.interpreter.baseobjspace import ObjSpace
 
-_ascii_char_info = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
+ascii_char_info = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
@@ -8,19 +8,45 @@
 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 ]
 
-_linebreak = ord("\n")
+linebreak = ord("\n")
+underline = ord("_")
+
+# Static list of all unicode codepoints reported by Py_UNICODE_ISLINEBREAK.
+# Using a dict as a poor man's set.
+uni_linebreaks = {10: True, 13: True, 28: True, 29: True, 30: True, 133: True,
+                  8232: True, 8233: True}
 
 def _is_digit(space, w_char):
     code = space.int_w(space.ord(w_char))
-    return space.newbool(code < 128 and _ascii_char_info[code] & 1)
+    return space.newbool(code < 128 and ascii_char_info[code] & 1)
 
 def _is_space(space, w_char):
     code = space.int_w(space.ord(w_char))
-    return space.newbool(code < 128 and _ascii_char_info[code] & 2)
+    return space.newbool(code < 128 and ascii_char_info[code] & 2)
 
 def _is_word(space, w_char):
     code = space.int_w(space.ord(w_char))
-    return space.newbool(code < 128 and _ascii_char_info[code] & 16)
+    return space.newbool(code < 128 and ascii_char_info[code] & 16)
+
+def _is_uni_word(space, w_char):
+    code = space.int_w(space.ord(w_char))
+    w_unichar = space.newunicode([code])
+    isalnum = space.is_true(space.call_method(w_unichar, "isalnum"))
+    return space.newbool(isalnum or code == underline)
+
+def _is_loc_word(space, w_char):
+    code = space.int_w(space.ord(w_char))
+    if code > 255:
+        return space.newbool(False)
+    # Use the newly wrapped w_char_not_uni from here on, because unlike
+    # w_char it is guaranteed not to be unicode.
+    w_char_not_uni = space.wrap(chr(code))
+    isalnum = space.is_true(space.call_method(w_char_not_uni, "isalnum"))
+    return space.newbool(isalnum or code == underline)
 
 def _is_linebreak(space, w_char):
-    return space.newbool(space.int_w(space.ord(w_char)) == _linebreak)
+    return space.newbool(space.int_w(space.ord(w_char)) == linebreak)
+
+def _is_uni_linebreak(space, w_char):
+    code = space.int_w(space.ord(w_char))
+    return space.newbool(uni_linebreaks.has_key(code))

Modified: pypy/dist/pypy/module/_sre/test/test_app_sre.py
==============================================================================
--- pypy/dist/pypy/module/_sre/test/test_app_sre.py	(original)
+++ pypy/dist/pypy/module/_sre/test/test_app_sre.py	Wed Aug 10 19:17:29 2005
@@ -303,6 +303,7 @@
         LOWER_PI = u"\u03c0"
         INDIAN_DIGIT = u"\u0966"
         EM_SPACE = u"\u2001"
+        LOWER_AE = "\xe4"
         assert re.search(r"bla\d\s\w", "bla3 b")
         assert re.search(r"b\d", u"b%s" % INDIAN_DIGIT, re.UNICODE)
         assert not re.search(r"b\D", u"b%s" % INDIAN_DIGIT, re.UNICODE)
@@ -310,6 +311,7 @@
         assert not re.search(r"b\S", u"b%s" % EM_SPACE, re.UNICODE)
         assert re.search(r"b\w", u"b%s" % LOWER_PI, re.UNICODE)
         assert not re.search(r"b\W", u"b%s" % LOWER_PI, re.UNICODE)
+        assert re.search(r"b\w", "b%s" % LOWER_AE, re.UNICODE)
 
     def test_search_simple_any(self):
         import re
@@ -657,9 +659,11 @@
             opcodes2 = s.encode_literal("b") \
                 + [s.OPCODES["category"], s.CHCODES["category_loc_not_word"], s.OPCODES["success"]]
             s.assert_no_match(opcodes1, "b\xFC")
+            s.assert_no_match(opcodes1, u"b\u00FC")
             s.assert_match(opcodes2, "b\xFC")
             locale.setlocale(locale.LC_ALL, "de_DE")
             s.assert_match(opcodes1, "b\xFC")
+            s.assert_no_match(opcodes1, u"b\u00FC")
             s.assert_no_match(opcodes2, "b\xFC")
             s.void_locale()
         except locale.Error: