[Python-checkins] bpo-35859: Fix a few long-standing bugs in re engine (GH-12427)

serhiy-storchaka webhook-mailer at python.org
Tue Mar 29 10:31:25 EDT 2022


https://github.com/python/cpython/commit/356997cccc21a3391175d20e9ef03d434675b496
commit: 356997cccc21a3391175d20e9ef03d434675b496
branch: main
author: Ma Lin <animalize at users.noreply.github.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-03-29T17:31:01+03:00
summary:

bpo-35859: Fix a few long-standing bugs in re engine (GH-12427)

In rare cases, capturing group could get wrong result.

Regular expression engines in Perl and Java have similar bugs.
The new behavior now matches the behavior of more modern
RE engines: in the regex module and in PHP, Ruby and Node.js.

files:
A Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst
M Doc/whatsnew/3.11.rst
M Lib/test/test_re.py
M Modules/_sre.c
M Modules/sre_lib.h

diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index 41e4659b0f779..837d8c8cbd016 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -718,6 +718,11 @@ Changes in the Python API
   deprecated since Python 3.6.
   (Contributed by Serhiy Storchaka in :issue:`47066`.)
 
+* :mod:`re` module: Fix a few long-standing bugs where, in rare cases,
+  capturing group could get wrong result. So the result may be different than
+  before.
+  (Contributed by Ma Lin in :issue:`35859`.)
+
 * The *population* parameter of :func:`random.sample` must be a sequence.
   Automatic conversion of sets to lists is no longer supported. If the sample size
   is larger than the population size, a :exc:`ValueError` is raised.
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index fd6db6a300d03..85716fbe2a8e8 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -2033,6 +2033,75 @@ def test_bug_34294(self):
                           {'tag': 'foo', 'text': None},
                           {'tag': 'foo', 'text': None}])
 
+    def test_MARK_PUSH_macro_bug(self):
+        # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
+        # was the only available mark.
+        self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
+        self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
+        self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
+        self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))
+
+    def test_MIN_UNTIL_mark_bug(self):
+        # Fixed in issue35859, reported in issue9134.
+        # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
+        s = 'axxzbcz'
+        p = r'(?:(?:a|bc)*?(xx)??z)*'
+        self.assertEqual(re.match(p, s).groups(), ('xx',))
+
+        # test-case provided by issue9134
+        s = 'xtcxyzxc'
+        p = r'((x|yz)+?(t)??c)*'
+        m = re.match(p, s)
+        self.assertEqual(m.span(), (0, 8))
+        self.assertEqual(m.span(2), (6, 7))
+        self.assertEqual(m.groups(), ('xyzxc', 'x', 't'))
+
+    def test_REPEAT_ONE_mark_bug(self):
+        # issue35859
+        # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
+        s = 'aabaab'
+        p = r'(?:[^b]*a(?=(b)|(a))ab)*'
+        m = re.match(p, s)
+        self.assertEqual(m.span(), (0, 6))
+        self.assertEqual(m.span(2), (4, 5))
+        self.assertEqual(m.groups(), (None, 'a'))
+
+        # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
+        s = 'abab'
+        p = r'(?:[^b]*(?=(b)|(a))ab)*'
+        m = re.match(p, s)
+        self.assertEqual(m.span(), (0, 4))
+        self.assertEqual(m.span(2), (2, 3))
+        self.assertEqual(m.groups(), (None, 'a'))
+
+        self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))
+
+    def test_MIN_REPEAT_ONE_mark_bug(self):
+        # issue35859
+        # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
+        s = 'abab'
+        p = r'(?:.*?(?=(a)|(b))b)*'
+        m = re.match(p, s)
+        self.assertEqual(m.span(), (0, 4))
+        self.assertEqual(m.span(2), (3, 4))
+        self.assertEqual(m.groups(), (None, 'b'))
+
+        s = 'axxzaz'
+        p = r'(?:a*?(xx)??z)*'
+        self.assertEqual(re.match(p, s).groups(), ('xx',))
+
+    def test_ASSERT_NOT_mark_bug(self):
+        # Fixed in issue35859, reported in issue725149.
+        # JUMP_ASSERT_NOT should LASTMARK_SAVE()
+        self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))
+
+        # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
+        m = re.match(r'((?!(ab)c)(.))*', 'abab')
+        self.assertEqual(m.span(), (0, 4))
+        self.assertEqual(m.span(1), (3, 4))
+        self.assertEqual(m.span(3), (3, 4))
+        self.assertEqual(m.groups(), ('b', None, 'b'))
+
     def test_bug_40736(self):
         with self.assertRaisesRegex(TypeError, "got 'int'"):
             re.search("x*", 5)
diff --git a/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst b/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst
new file mode 100644
index 0000000000000..8c88ef01164e4
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst
@@ -0,0 +1,2 @@
+:mod:`re` module, fix a few bugs about capturing group. In rare cases,
+capturing group gets an incorrect string. Patch by Ma Lin.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 35bdb4f3d2202..48193f82475a4 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -532,6 +532,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
     } else {
         i = STATE_OFFSET(state, state->mark[index]);
         j = STATE_OFFSET(state, state->mark[index+1]);
+
+        /* check wrong span */
+        if (i > j) {
+            PyErr_SetString(PyExc_SystemError,
+                            "The span of capturing group is wrong,"
+                            " please report a bug for the re module.");
+            return NULL;
+        }
     }
 
     return getslice(state->isbytes, state->beginning, string, i, j);
@@ -2477,6 +2485,15 @@ pattern_new_match(_sremodulestate* module_state,
             if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
                 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
                 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
+
+                /* check wrong span */
+                if (match->mark[j+2] > match->mark[j+3]) {
+                    PyErr_SetString(PyExc_SystemError,
+                                    "The span of capturing group is wrong,"
+                                    " please report a bug for the re module.");
+                    Py_DECREF(match);
+                    return NULL;
+                }
             } else
                 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
 
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index a82210ff94a0f..8e4e714eada38 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -449,20 +449,20 @@ do { \
     DATA_STACK_LOOKUP_AT(state,t,p,pos)
 
 #define MARK_PUSH(lastmark) \
-    do if (lastmark > 0) { \
+    do if (lastmark >= 0) { \
         i = lastmark; /* ctx->lastmark may change if reallocated */ \
         DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
     } while (0)
 #define MARK_POP(lastmark) \
-    do if (lastmark > 0) { \
+    do if (lastmark >= 0) { \
         DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
     } while (0)
 #define MARK_POP_KEEP(lastmark) \
-    do if (lastmark > 0) { \
+    do if (lastmark >= 0) { \
         DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
     } while (0)
 #define MARK_POP_DISCARD(lastmark) \
-    do if (lastmark > 0) { \
+    do if (lastmark >= 0) { \
         DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
     } while (0)
 
@@ -770,8 +770,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
             TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
             LASTMARK_SAVE();
-            ctx->u.rep = state->repeat;
-            if (ctx->u.rep)
+            if (state->repeat)
                 MARK_PUSH(ctx->lastmark);
             for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
                 if (ctx->pattern[1] == SRE_OP_LITERAL &&
@@ -786,16 +785,16 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
                 state->ptr = ctx->ptr;
                 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
                 if (ret) {
-                    if (ctx->u.rep)
+                    if (state->repeat)
                         MARK_POP_DISCARD(ctx->lastmark);
                     RETURN_ON_ERROR(ret);
                     RETURN_SUCCESS;
                 }
-                if (ctx->u.rep)
+                if (state->repeat)
                     MARK_POP_KEEP(ctx->lastmark);
                 LASTMARK_RESTORE();
             }
-            if (ctx->u.rep)
+            if (state->repeat)
                 MARK_POP_DISCARD(ctx->lastmark);
             RETURN_FAILURE;
 
@@ -841,6 +840,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             }
 
             LASTMARK_SAVE();
+            if (state->repeat)
+                MARK_PUSH(ctx->lastmark);
 
             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
                 /* tail starts with a literal. skip positions where
@@ -858,16 +859,20 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
                     DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
                             ctx->pattern+ctx->pattern[0]);
                     if (ret) {
+                        if (state->repeat)
+                            MARK_POP_DISCARD(ctx->lastmark);
                         RETURN_ON_ERROR(ret);
                         RETURN_SUCCESS;
                     }
-
+                    if (state->repeat)
+                        MARK_POP_KEEP(ctx->lastmark);
                     LASTMARK_RESTORE();
 
                     ctx->ptr--;
                     ctx->count--;
                 }
-
+                if (state->repeat)
+                    MARK_POP_DISCARD(ctx->lastmark);
             } else {
                 /* general case */
                 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
@@ -875,13 +880,20 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
                     DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
                             ctx->pattern+ctx->pattern[0]);
                     if (ret) {
+                        if (state->repeat)
+                            MARK_POP_DISCARD(ctx->lastmark);
                         RETURN_ON_ERROR(ret);
                         RETURN_SUCCESS;
                     }
+                    if (state->repeat)
+                        MARK_POP_KEEP(ctx->lastmark);
+                    LASTMARK_RESTORE();
+
                     ctx->ptr--;
                     ctx->count--;
-                    LASTMARK_RESTORE();
                 }
+                if (state->repeat)
+                    MARK_POP_DISCARD(ctx->lastmark);
             }
             RETURN_FAILURE;
 
@@ -930,15 +942,24 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             } else {
                 /* general case */
                 LASTMARK_SAVE();
+                if (state->repeat)
+                    MARK_PUSH(ctx->lastmark);
+
                 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
                        || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
                     state->ptr = ctx->ptr;
                     DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
                             ctx->pattern+ctx->pattern[0]);
                     if (ret) {
+                        if (state->repeat)
+                            MARK_POP_DISCARD(ctx->lastmark);
                         RETURN_ON_ERROR(ret);
                         RETURN_SUCCESS;
                     }
+                    if (state->repeat)
+                        MARK_POP_KEEP(ctx->lastmark);
+                    LASTMARK_RESTORE();
+
                     state->ptr = ctx->ptr;
                     ret = SRE(count)(state, ctx->pattern+3, 1);
                     RETURN_ON_ERROR(ret);
@@ -948,8 +969,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
                     assert(ret == 1);
                     ctx->ptr++;
                     ctx->count++;
-                    LASTMARK_RESTORE();
                 }
+                if (state->repeat)
+                    MARK_POP_DISCARD(ctx->lastmark);
             }
             RETURN_FAILURE;
 
@@ -1098,8 +1120,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
                tail matches */
             state->repeat = ctx->u.rep->prev;
             DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
+            state->repeat = ctx->u.rep; // restore repeat before return
+
             RETURN_ON_SUCCESS(ret);
-            state->repeat = ctx->u.rep;
             state->ptr = ctx->ptr;
             RETURN_FAILURE;
 
@@ -1132,21 +1155,29 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
                 RETURN_FAILURE;
             }
 
-            LASTMARK_SAVE();
-
             /* see if the tail matches */
             state->repeat = ctx->u.rep->prev;
+
+            LASTMARK_SAVE();
+            if (state->repeat)
+                MARK_PUSH(ctx->lastmark);
+
             DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
+            SRE_REPEAT *repeat_of_tail = state->repeat;
+            state->repeat = ctx->u.rep; // restore repeat before return
+
             if (ret) {
+                if (repeat_of_tail)
+                    MARK_POP_DISCARD(ctx->lastmark);
                 RETURN_ON_ERROR(ret);
                 RETURN_SUCCESS;
             }
+            if (repeat_of_tail)
+                MARK_POP(ctx->lastmark);
+            LASTMARK_RESTORE();
 
-            state->repeat = ctx->u.rep;
             state->ptr = ctx->ptr;
 
-            LASTMARK_RESTORE();
-
             if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
                 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
                 state->ptr == ctx->u.rep->last_ptr)
@@ -1444,11 +1475,20 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
                    ctx->ptr, ctx->pattern[1]));
             if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
                 state->ptr = ctx->ptr - ctx->pattern[1];
+                LASTMARK_SAVE();
+                if (state->repeat)
+                    MARK_PUSH(ctx->lastmark);
+
                 DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
                 if (ret) {
+                    if (state->repeat)
+                        MARK_POP_DISCARD(ctx->lastmark);
                     RETURN_ON_ERROR(ret);
                     RETURN_FAILURE;
                 }
+                if (state->repeat)
+                    MARK_POP(ctx->lastmark);
+                LASTMARK_RESTORE();
             }
             ctx->pattern += ctx->pattern[0];
             break;



More information about the Python-checkins mailing list