[pypy-commit] pypy default: Support the "General unicode category" pattern in rsre.
arigo
noreply at buildbot.pypy.org
Thu Nov 28 17:23:30 CET 2013
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r68337:fc6598a6d83a
Date: 2013-11-28 17:22 +0100
http://bitbucket.org/pypy/pypy/changeset/fc6598a6d83a/
Log: Support the "General unicode category" pattern in rsre.
diff --git a/rpython/rlib/rsre/rsre_char.py b/rpython/rlib/rsre/rsre_char.py
--- a/rpython/rlib/rsre/rsre_char.py
+++ b/rpython/rlib/rsre/rsre_char.py
@@ -145,13 +145,11 @@
result = False
while True:
opcode = pattern[ppos]
- i = 0
- for function in set_dispatch_unroll:
- if function is not None and opcode == i:
+ for i, function in set_dispatch_unroll:
+ if opcode == i:
newresult, ppos = function(pattern, ppos, char_code)
result |= newresult
break
- i = i + 1
else:
if opcode == 0: # FAILURE
break
@@ -225,13 +223,42 @@
index += count * (32 / CODESIZE) # skip blocks
return match, index
-set_dispatch_table = [
- None, # FAILURE
- None, None, None, None, None, None, None, None,
- set_category, set_charset, set_bigcharset, None, None, None,
- None, None, None, None, set_literal, None, None, None, None,
- None, None,
- None, # NEGATE
- set_range
-]
-set_dispatch_unroll = unrolling_iterable(set_dispatch_table)
+def set_unicode_general_category(pat, index, char_code):
+ # Unicode "General category property code" (not used by Python).
+ # A general category is two letters. 'pat[index+1]' contains both
+ # the first character, and the second character shifted by 8.
+ # http://en.wikipedia.org/wiki/Unicode_character_property#General_Category
+ # Also supports single-character categories, if the second character is 0.
+ # Negative matches are triggered by bit number 7.
+ assert unicodedb is not None
+ cat = unicodedb.category(char_code)
+ category_code = pat[index + 1]
+ first_character = category_code & 0x7F
+ second_character = (category_code >> 8) & 0x7F
+ negative_match = category_code & 0x80
+ #
+ if second_character == 0:
+ # single-character match
+ check = ord(cat[0])
+ expected = first_character
+ else:
+ # two-characters match
+ check = ord(cat[0]) | (ord(cat[1]) << 8)
+ expected = first_character | (second_character << 8)
+ #
+ if negative_match:
+ result = check != expected
+ else:
+ result = check == expected
+ #
+ return result, index + 2
+
+set_dispatch_table = {
+ 9: set_category,
+ 10: set_charset,
+ 11: set_bigcharset,
+ 19: set_literal,
+ 27: set_range,
+ 70: set_unicode_general_category,
+}
+set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items()))
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -19,7 +19,7 @@
#OPCODE_CALL = 8
OPCODE_CATEGORY = 9
OPCODE_CHARSET = 10
-#OPCODE_BIGCHARSET = 11
+OPCODE_BIGCHARSET = 11
OPCODE_GROUPREF = 12
OPCODE_GROUPREF_EXISTS = 13
OPCODE_GROUPREF_IGNORE = 14
@@ -41,6 +41,9 @@
#OPCODE_SUBPATTERN = 30
OPCODE_MIN_REPEAT_ONE = 31
+# not used by Python itself
+OPCODE_UNICODE_GENERAL_CATEGORY = 70
+
# ____________________________________________________________
_seen_specname = {}
diff --git a/rpython/rlib/rsre/test/test_char.py b/rpython/rlib/rsre/test/test_char.py
--- a/rpython/rlib/rsre/test/test_char.py
+++ b/rpython/rlib/rsre/test/test_char.py
@@ -126,3 +126,44 @@
assert cat(CHCODES["category_uni_not_digit"], ROMAN_NUMERAL)
assert cat(CHCODES["category_uni_not_digit"], CIRCLED_NUMBER)
assert cat(CHCODES["category_uni_not_digit"], DINGBAT_CIRCLED)
+
+
+def test_general_category():
+ from rpython.rlib.unicodedata import unicodedb
+
+ for cat, positive, negative in [('L', u'aZ\xe9', u'. ?'),
+ ('P', u'.?', u'aZ\xe9 ')]:
+ pat_pos = [70, ord(cat), 0]
+ pat_neg = [70, ord(cat) | 0x80, 0]
+ for c in positive:
+ assert unicodedb.category(ord(c)).startswith(cat)
+ assert rsre_char.check_charset(pat_pos, 0, ord(c))
+ assert not rsre_char.check_charset(pat_neg, 0, ord(c))
+ for c in negative:
+ assert not unicodedb.category(ord(c)).startswith(cat)
+ assert not rsre_char.check_charset(pat_pos, 0, ord(c))
+ assert rsre_char.check_charset(pat_neg, 0, ord(c))
+
+ def cat2num(cat):
+ return ord(cat[0]) | (ord(cat[1]) << 8)
+
+ for cat, positive, negative in [('Lu', u'A', u'z\xe9 '),
+ ('Ll', u'z\xe9', u'A \n')]:
+ pat_pos = [70, cat2num(cat), 0]
+ pat_neg = [70, cat2num(cat) | 0x80, 0]
+ for c in positive:
+ assert unicodedb.category(ord(c)) == cat
+ assert rsre_char.check_charset(pat_pos, 0, ord(c))
+ assert not rsre_char.check_charset(pat_neg, 0, ord(c))
+ for c in negative:
+ assert unicodedb.category(ord(c)) != cat
+ assert not rsre_char.check_charset(pat_pos, 0, ord(c))
+ assert rsre_char.check_charset(pat_neg, 0, ord(c))
+
+ # test for how the common 'L&' pattern might be compiled
+ pat = [70, cat2num('Lu'), 70, cat2num('Ll'), 70, cat2num('Lt'), 0]
+ assert rsre_char.check_charset(pat, 0, 65) # Lu
+ assert rsre_char.check_charset(pat, 0, 99) # Ll
+ assert rsre_char.check_charset(pat, 0, 453) # Lt
+ assert not rsre_char.check_charset(pat, 0, 688) # Lm
+ assert not rsre_char.check_charset(pat, 0, 5870) # Nl
More information about the pypy-commit mailing list