[Python-checkins] cpython: Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.

Fri Oct 31 11:42:52 CET 2014

https://hg.python.org/cpython/rev/6f52a3d0f548
changeset:   93319:6f52a3d0f548
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Fri Oct 31 12:36:56 2014 +0200
summary:
  Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.

files:
  Lib/sre_compile.py      |  35 +++++++++++++++++-----------
  Lib/sre_constants.py    |   9 ++++--
  Lib/test/test_re.py     |  19 +++++++++++++++
  Misc/NEWS               |   3 ++
  Modules/_sre.c          |  28 ++++++++++++++++++++--
  Modules/sre.h           |   2 +-
  Modules/sre_constants.h |   3 +-
  Modules/sre_lib.h       |  28 ++++++++++++++++++----
  8 files changed, 99 insertions(+), 28 deletions(-)

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -22,9 +22,6 @@
 else:
     MAXCODE = 0xFFFFFFFF
 
-def _identityfunction(x):
-    return x
-
 _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
 _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
 _SUCCESS_CODES = set([SUCCESS, FAILURE])
@@ -53,7 +50,7 @@
                     return _sre.getlower(literal, flags)
             else:
                 emit(OPCODES[op])
-                fixup = _identityfunction
+                fixup = None
             skip = _len(code); emit(0)
             _compile_charset(av, flags, code, fixup)
             code[skip] = _len(code) - skip
@@ -172,17 +169,15 @@
 def _compile_charset(charset, flags, code, fixup=None):
     # compile charset subprogram
     emit = code.append
-    if fixup is None:
-        fixup = _identityfunction
     for op, av in _optimize_charset(charset, fixup):
         emit(OPCODES[op])
         if op is NEGATE:
             pass
         elif op is LITERAL:
-            emit(fixup(av))
-        elif op is RANGE:
-            emit(fixup(av[0]))
-            emit(fixup(av[1]))
+            emit(av)
+        elif op is RANGE or op is RANGE_IGNORE:
+            emit(av[0])
+            emit(av[1])
         elif op is CHARSET:
             code.extend(av)
         elif op is BIGCHARSET:
@@ -207,9 +202,14 @@
         while True:
             try:
                 if op is LITERAL:
-                    charmap[fixup(av)] = 1
+                    if fixup:
+                        av = fixup(av)
+                    charmap[av] = 1
                 elif op is RANGE:
-                    for i in range(fixup(av[0]), fixup(av[1])+1):
+                    r = range(av[0], av[1]+1)
+                    if fixup:
+                        r = map(fixup, r)
+                    for i in r:
                         charmap[i] = 1
                 elif op is NEGATE:
                     out.append((op, av))
@@ -220,7 +220,12 @@
                     # character set contains non-UCS1 character codes
                     charmap += b'\0' * 0xff00
                     continue
-                # character set contains non-BMP character codes
+                # Character set contains non-BMP character codes.
+                # There are only two ranges of cased non-BMP characters:
+                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+                # and for both ranges RANGE_IGNORE works.
+                if fixup and op is RANGE:
+                    op = RANGE_IGNORE
                 tail.append((op, av))
             break
 
@@ -247,8 +252,10 @@
             else:
                 out.append((RANGE, (p, q - 1)))
         out += tail
-        if len(out) < len(charset):
+        # if the case was changed or new representation is more compact
+        if fixup or len(out) < len(charset):
             return out
+        # else original character set is good enough
         return charset
 
     # use bitmap
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20031017
+MAGIC = 20140917
 
 from _sre import MAXREPEAT, MAXGROUPS
 
@@ -56,6 +56,7 @@
 NOT_LITERAL = "not_literal"
 NOT_LITERAL_IGNORE = "not_literal_ignore"
 RANGE = "range"
+RANGE_IGNORE = "range_ignore"
 REPEAT = "repeat"
 REPEAT_ONE = "repeat_one"
 SUBPATTERN = "subpattern"
@@ -121,7 +122,8 @@
     REPEAT,
     REPEAT_ONE,
     SUBPATTERN,
-    MIN_REPEAT_ONE
+    MIN_REPEAT_ONE,
+    RANGE_IGNORE,
 
 ]
 
@@ -159,7 +161,8 @@
     GROUPREF: GROUPREF_IGNORE,
     IN: IN_IGNORE,
     LITERAL: LITERAL_IGNORE,
-    NOT_LITERAL: NOT_LITERAL_IGNORE
+    NOT_LITERAL: NOT_LITERAL_IGNORE,
+    RANGE: RANGE_IGNORE,
 }
 
 AT_MULTILINE = {
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -601,6 +601,25 @@
         self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
         self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
 
+    def test_ignore_case_range(self):
+        # Issues #3511, #17381.
+        self.assertTrue(re.match(r'[9-a]', '_', re.I))
+        self.assertIsNone(re.match(r'[9-A]', '_', re.I))
+        self.assertTrue(re.match(br'[9-a]', b'_', re.I))
+        self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
+        self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
+        self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
+        self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
+        self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
+        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
+        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
+        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
+        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
+        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
+        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
+        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
+        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
+
     def test_category(self):
         self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -180,6 +180,9 @@
 Library
 -------
 
+- Issue #17381: Fixed handling of case-insensitive ranges in regular
+  expressions.
+
 - Issue #22410: Module level functions in the re module now cache compiled
   locale-dependent regular expressions taking into account the locale.
 
diff --git a/Modules/_sre.c b/Modules/_sre.c
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -113,6 +113,11 @@
     return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
 }
 
+static unsigned int sre_upper(unsigned int ch)
+{
+    return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
+}
+
 /* locale-specific character predicates */
 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
  * warnings when c's type supports only numbers < N+1 */
@@ -124,6 +129,11 @@
     return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
 }
 
+static unsigned int sre_upper_locale(unsigned int ch)
+{
+    return ((ch) < 256 ? (unsigned int)toupper((ch)) : ch);
+}
+
 /* unicode-specific character predicates */
 
 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
@@ -137,6 +147,11 @@
     return (unsigned int) Py_UNICODE_TOLOWER(ch);
 }
 
+static unsigned int sre_upper_unicode(unsigned int ch)
+{
+    return (unsigned int) Py_UNICODE_TOUPPER(ch);
+}
+
 LOCAL(int)
 sre_category(SRE_CODE category, unsigned int ch)
 {
@@ -377,12 +392,18 @@
     state->pos = start;
     state->endpos = end;
 
-    if (pattern->flags & SRE_FLAG_LOCALE)
+    if (pattern->flags & SRE_FLAG_LOCALE) {
         state->lower = sre_lower_locale;
-    else if (pattern->flags & SRE_FLAG_UNICODE)
+        state->upper = sre_upper_locale;
+    }
+    else if (pattern->flags & SRE_FLAG_UNICODE) {
         state->lower = sre_lower_unicode;
-    else
+        state->upper = sre_upper_unicode;
+    }
+    else {
         state->lower = sre_lower;
+        state->upper = sre_upper;
+    }
 
     return string;
   err:
@@ -1567,6 +1588,7 @@
             break;
 
         case SRE_OP_RANGE:
+        case SRE_OP_RANGE_IGNORE:
             GET_ARG;
             GET_ARG;
             break;
diff --git a/Modules/sre.h b/Modules/sre.h
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -84,7 +84,7 @@
     /* current repeat context */
     SRE_REPEAT *repeat;
     /* hooks */
-    SRE_TOLOWER_HOOK lower;
+    SRE_TOLOWER_HOOK lower, upper;
 } SRE_STATE;
 
 typedef struct {
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
  * See the _sre.c file for information on usage and redistribution.
  */
 
-#define SRE_MAGIC 20031017
+#define SRE_MAGIC 20140917
 #define SRE_OP_FAILURE 0
 #define SRE_OP_SUCCESS 1
 #define SRE_OP_ANY 2
@@ -44,6 +44,7 @@
 #define SRE_OP_REPEAT_ONE 29
 #define SRE_OP_SUBPATTERN 30
 #define SRE_OP_MIN_REPEAT_ONE 31
+#define SRE_OP_RANGE_IGNORE 32
 #define SRE_AT_BEGINNING 0
 #define SRE_AT_BEGINNING_LINE 1
 #define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -101,7 +101,7 @@
 }
 
 LOCAL(int)
-SRE(charset)(SRE_CODE* set, SRE_CODE ch)
+SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
 {
     /* check if character is a member of the given set */
 
@@ -142,6 +142,20 @@
             set += 2;
             break;
 
+        case SRE_OP_RANGE_IGNORE:
+            /* <RANGE_IGNORE> <lower> <upper> */
+        {
+            SRE_CODE uch;
+            /* ch is already lower cased */
+            if (set[0] <= ch && ch <= set[1])
+                return ok;
+            uch = state->upper(ch);
+            if (set[0] <= uch && uch <= set[1])
+                return ok;
+            set += 2;
+            break;
+        }
+
         case SRE_OP_NEGATE:
             ok = !ok;
             break;
@@ -193,7 +207,7 @@
     case SRE_OP_IN:
         /* repeated set */
         TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
-        while (ptr < end && SRE(charset)(pattern + 2, *ptr))
+        while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
             ptr++;
         break;
 
@@ -628,7 +642,8 @@
             /* match set member (or non_member) */
             /* <IN> <skip> <set> */
             TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
-            if (ctx->ptr >= end || !SRE(charset)(ctx->pattern + 1, *ctx->ptr))
+            if (ctx->ptr >= end ||
+                !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr))
                 RETURN_FAILURE;
             ctx->pattern += ctx->pattern[0];
             ctx->ptr++;
@@ -657,7 +672,7 @@
         case SRE_OP_IN_IGNORE:
             TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
             if (ctx->ptr >= end
-                || !SRE(charset)(ctx->pattern+1,
+                || !SRE(charset)(state, ctx->pattern+1,
                                  (SRE_CODE)state->lower(*ctx->ptr)))
                 RETURN_FAILURE;
             ctx->pattern += ctx->pattern[0];
@@ -688,7 +703,8 @@
                     continue;
                 if (ctx->pattern[1] == SRE_OP_IN &&
                     (ctx->ptr >= end ||
-                     !SRE(charset)(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
+                     !SRE(charset)(state, ctx->pattern + 3,
+                                   (SRE_CODE) *ctx->ptr)))
                     continue;
                 state->ptr = ctx->ptr;
                 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
@@ -1310,7 +1326,7 @@
         /* pattern starts with a character from a known set */
         end = (SRE_CHAR *)state->end;
         for (;;) {
-            while (ptr < end && !SRE(charset)(charset, *ptr))
+            while (ptr < end && !SRE(charset)(state, charset, *ptr))
                 ptr++;
             if (ptr >= end)
                 return 0;

-- 
Repository URL: https://hg.python.org/cpython