[Python-checkins] cpython: Issue #22437: Number of capturing groups in regular expression is no longer

serhiy.storchaka python-checkins at python.org
Mon Sep 29 21:50:59 CEST 2014


https://hg.python.org/cpython/rev/0b85ea4bd1af
changeset:   92628:0b85ea4bd1af
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Mon Sep 29 22:49:23 2014 +0300
summary:
  Issue #22437: The number of capturing groups in a regular expression is no
longer limited to 100.

files:
  Doc/whatsnew/3.5.rst |   6 +++
  Lib/sre_compile.py   |   6 ---
  Lib/sre_constants.py |   2 +-
  Lib/sre_parse.py     |  10 +++++
  Lib/test/test_re.py  |  18 ++++++++-
  Misc/NEWS            |   3 +
  Modules/_sre.c       |  57 ++++++++++++++++++++++---------
  Modules/sre.h        |   7 +--
  8 files changed, 79 insertions(+), 30 deletions(-)


diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -217,6 +217,12 @@
 * :class:`os.stat_result` now has a :attr:`~os.stat_result.st_file_attributes`
   attribute on Windows (contributed by Ben Hoyt in :issue:`21719`).
 
+re
+--
+
+* Number of capturing groups in regular expression is no longer limited by 100.
+  (Contributed by Serhiy Storchaka in :issue:`22437`.)
+
 shutil
 ------
 
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -470,12 +470,6 @@
 
     # print code
 
-    # XXX: <fl> get rid of this limitation!
-    if p.pattern.groups > 100:
-        raise AssertionError(
-            "sorry, but this version only supports 100 named groups"
-            )
-
     # map in either direction
     groupindex = p.pattern.groupdict
     indexgroup = [None] * p.pattern.groups
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -15,7 +15,7 @@
 
 MAGIC = 20031017
 
-from _sre import MAXREPEAT
+from _sre import MAXREPEAT, MAXGROUPS
 
 # SRE standard exception (access as sre.error)
 # should this really be here?
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -72,6 +72,8 @@
     def opengroup(self, name=None):
         gid = self.groups
         self.groups = gid + 1
+        if self.groups > MAXGROUPS:
+            raise error("groups number is too large")
         if name is not None:
             ogid = self.groupdict.get(name, None)
             if ogid is not None:
@@ -695,8 +697,14 @@
                     else:
                         try:
                             condgroup = int(condname)
+                            if condgroup < 0:
+                                raise ValueError
                         except ValueError:
                             raise error("bad character in group name")
+                        if not condgroup:
+                            raise error("bad group number")
+                        if condgroup >= MAXGROUPS:
+                            raise error("the group number is too large")
                 else:
                     # flags
                     if not source.next in FLAGS:
@@ -822,6 +830,8 @@
                     index = int(name)
                     if index < 0:
                         raise error("negative group number")
+                    if index >= MAXGROUPS:
+                        raise error("the group number is too large")
                 except ValueError:
                     if not name.isidentifier():
                         raise error("bad character in group name")
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -193,6 +193,7 @@
     def test_symbolic_groups(self):
         re.compile('(?P<a>x)(?P=a)(?(a)y)')
         re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
+        re.compile('(?P<a1>x)\1(?(1)y)')
         self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
         self.assertRaises(re.error, re.compile, '(?Px)')
         self.assertRaises(re.error, re.compile, '(?P=)')
@@ -212,6 +213,10 @@
         re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
         re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
         self.assertRaises(re.error, re.compile, '(?P<©>x)')
+        # Support > 100 groups.
+        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
+        pat = '(?:%s)(?(200)z|t)' % pat
+        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
 
     def test_symbolic_refs(self):
         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
@@ -228,6 +233,9 @@
         self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
         self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
         self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
+        # Support > 100 groups.
+        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
+        self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
 
     def test_re_subn(self):
         self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -404,6 +412,10 @@
         self.assertIsNone(p.match('abd'))
         self.assertIsNone(p.match('ac'))
 
+        # Support > 100 groups.
+        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
+        pat = '(?:%s)(?(200)z)' % pat
+        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
 
     def test_re_groupref(self):
         self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
@@ -1070,8 +1082,10 @@
         # a RuntimeError is raised instead of OverflowError.
         long_overflow = 2**128
         self.assertRaises(TypeError, re.finditer, "a", {})
-        self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
-        self.assertRaises(TypeError, _sre.compile, {}, 0, [])
+        with self.assertRaises(OverflowError):
+            _sre.compile("abc", 0, [long_overflow], 0, [], [])
+        with self.assertRaises(TypeError):
+            _sre.compile({}, 0, [], 0, [], [])
 
     def test_search_dot_unicode(self):
         self.assertTrue(re.search("123.*-", '123abc-'))
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -145,6 +145,9 @@
 Library
 -------
 
+- Issue #22437: Number of capturing groups in regular expression is no longer
+  limited by 100.
+
 - Issue #17442: InteractiveInterpreter now displays the full chained traceback
   in its showtraceback method, to match the built in interactive interpreter.
 
diff --git a/Modules/_sre.c b/Modules/_sre.c
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -357,6 +357,11 @@
 
     memset(state, 0, sizeof(SRE_STATE));
 
+    state->mark = PyMem_New(void *, pattern->groups * 2);
+    if (!state->mark) {
+        PyErr_NoMemory();
+        goto err;
+    }
     state->lastmark = -1;
     state->lastindex = -1;
 
@@ -409,6 +414,8 @@
 
     return string;
   err:
+    PyMem_Del(state->mark);
+    state->mark = NULL;
     if (state->buffer.buf)
         PyBuffer_Release(&state->buffer);
     return NULL;
@@ -421,6 +428,8 @@
         PyBuffer_Release(&state->buffer);
     Py_XDECREF(state->string);
     data_stack_dealloc(state);
+    PyMem_Del(state->mark);
+    state->mark = NULL;
 }
 
 /* calculate offset from start of string */
@@ -560,6 +569,7 @@
     PyObject *pattern = NULL;
     SRE_STATE state;
     Py_ssize_t status;
+    PyObject *match;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs,
         "|Onn$O:match", _keywords,
@@ -579,12 +589,14 @@
     status = sre_match(&state, PatternObject_GetCode(self), 0);
 
     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
-    if (PyErr_Occurred())
+    if (PyErr_Occurred()) {
+        state_fini(&state);
         return NULL;
-
+    }
+
+    match = pattern_new_match(self, &state, status);
     state_fini(&state);
-
-    return (PyObject *)pattern_new_match(self, &state, status);
+    return match;
 }
 
 static PyObject*
@@ -592,6 +604,7 @@
 {
     SRE_STATE state;
     Py_ssize_t status;
+    PyObject *match;
 
     PyObject *string = NULL, *string2 = NULL;
     Py_ssize_t start = 0;
@@ -616,12 +629,14 @@
     status = sre_match(&state, PatternObject_GetCode(self), 1);
 
     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
-    if (PyErr_Occurred())
+    if (PyErr_Occurred()) {
+        state_fini(&state);
         return NULL;
-
+    }
+
+    match = pattern_new_match(self, &state, status);
     state_fini(&state);
-
-    return pattern_new_match(self, &state, status);
+    return match;
 }
 
 static PyObject*
@@ -629,6 +644,7 @@
 {
     SRE_STATE state;
     Py_ssize_t status;
+    PyObject *match;
 
     PyObject *string = NULL, *string2 = NULL;
     Py_ssize_t start = 0;
@@ -652,12 +668,14 @@
 
     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
 
+    if (PyErr_Occurred()) {
+        state_fini(&state);
+        return NULL;
+    }
+
+    match = pattern_new_match(self, &state, status);
     state_fini(&state);
-
-    if (PyErr_Occurred())
-        return NULL;
-
-    return pattern_new_match(self, &state, status);
+    return match;
 }
 
 static PyObject*
@@ -1417,7 +1435,7 @@
     PyObject* groupindex = NULL;
     PyObject* indexgroup = NULL;
 
-    if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
+    if (!PyArg_ParseTuple(args, "OiO!nOO", &pattern, &flags,
                           &PyList_Type, &code, &groups,
                           &groupindex, &indexgroup))
         return NULL;
@@ -1933,10 +1951,9 @@
 static int
 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
 {
-    if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
+    if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
+        code >= end || end[-1] != SRE_OP_SUCCESS)
         FAIL;
-    if (groups == 0)  /* fix for simplejson */
-        groups = 100; /* 100 groups should always be safe */
     return _validate_inner(code, end-1, groups);
 }
 
@@ -2747,6 +2764,12 @@
         Py_DECREF(x);
     }
 
+    x = PyLong_FromUnsignedLong(SRE_MAXGROUPS);
+    if (x) {
+        PyDict_SetItemString(d, "MAXGROUPS", x);
+        Py_DECREF(x);
+    }
+
     x = PyUnicode_FromString(copyright);
     if (x) {
         PyDict_SetItemString(d, "copyright", x);
diff --git a/Modules/sre.h b/Modules/sre.h
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -18,8 +18,10 @@
 #define SRE_CODE Py_UCS4
 #if SIZEOF_SIZE_T > 4
 # define SRE_MAXREPEAT (~(SRE_CODE)0)
+# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2)
 #else
 # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX)
+# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2)
 #endif
 
 typedef struct {
@@ -52,9 +54,6 @@
 
 typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
 
-/* FIXME: <fl> shouldn't be a constant, really... */
-#define SRE_MARK_SIZE 200
-
 typedef struct SRE_REPEAT_T {
     Py_ssize_t count;
     SRE_CODE* pattern; /* points to REPEAT operator arguments */
@@ -76,7 +75,7 @@
     /* registers */
     Py_ssize_t lastindex;
     Py_ssize_t lastmark;
-    void* mark[SRE_MARK_SIZE];
+    void** mark;
     /* dynamically allocated stuff */
     char* data_stack;
     size_t data_stack_size;

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list