[Python-checkins] cpython: Issue #22364: Improved some re error messages using regex for hints.

serhiy.storchaka python-checkins at python.org
Wed Mar 25 20:04:37 CET 2015


https://hg.python.org/cpython/rev/068365acbe73
changeset:   95207:068365acbe73
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Wed Mar 25 21:03:47 2015 +0200
summary:
  Issue #22364: Improved some re error messages using regex for hints.

files:
  Lib/re.py           |    2 +-
  Lib/sre_compile.py  |    6 +-
  Lib/sre_parse.py    |  227 ++++++++++++++--------------
  Lib/test/test_re.py |  251 ++++++++++++++++++++++---------
  Misc/NEWS           |    2 +
  Modules/_sre.c      |    6 +-
  6 files changed, 299 insertions(+), 195 deletions(-)


diff --git a/Lib/re.py b/Lib/re.py
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -286,7 +286,7 @@
     if isinstance(pattern, _pattern_type):
         if flags:
             raise ValueError(
-                "Cannot process flags argument with a compiled pattern")
+                "cannot process flags argument with a compiled pattern")
         return pattern
     if not sre_compile.isstring(pattern):
         raise TypeError("first argument must be string or compiled pattern")
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -113,7 +113,7 @@
                 emit(ANY)
         elif op in REPEATING_CODES:
             if flags & SRE_FLAG_TEMPLATE:
-                raise error("internal: unsupported template operator")
+                raise error("internal: unsupported template operator %r" % (op,))
             elif _simple(av) and op is not REPEAT:
                 if op is MAX_REPEAT:
                     emit(REPEAT_ONE)
@@ -216,7 +216,7 @@
             else:
                 code[skipyes] = _len(code) - skipyes + 1
         else:
-            raise ValueError("unsupported operand type", op)
+            raise error("internal: unsupported operand type %r" % (op,))
 
 def _compile_charset(charset, flags, code, fixup=None, fixes=None):
     # compile charset subprogram
@@ -242,7 +242,7 @@
             else:
                 emit(av)
         else:
-            raise error("internal: unsupported set operator")
+            raise error("internal: unsupported set operator %r" % (op,))
     emit(FAILURE)
 
 def _optimize_charset(charset, fixup, fixes):
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -79,7 +79,7 @@
         gid = self.groups
         self.subpatterns.append(None)
         if self.groups > MAXGROUPS:
-            raise error("groups number is too large")
+            raise error("too many groups")
         if name is not None:
             ogid = self.groupdict.get(name, None)
             if ogid is not None:
@@ -235,7 +235,7 @@
             try:
                 char += self.decoded_string[index]
             except IndexError:
-                raise error("bogus escape (end of line)",
+                raise error("bad escape (end of pattern)",
                             self.string, len(self.string) - 1) from None
         self.index = index + 1
         self.next = char
@@ -263,8 +263,13 @@
             c = self.next
             self.__next()
             if c is None:
-                raise self.error("unterminated name")
+                if not result:
+                    raise self.error("missing group name")
+                raise self.error("missing %s, unterminated name" % terminator,
+                                 len(result))
             if c == terminator:
+                if not result:
+                    raise self.error("missing group name", 1)
                 break
             result += c
         return result
@@ -318,19 +323,19 @@
             # hexadecimal escape (exactly two digits)
             escape += source.getwhile(2, HEXDIGITS)
             if len(escape) != 4:
-                raise ValueError
+                raise source.error("incomplete escape %s" % escape, len(escape))
             return LITERAL, int(escape[2:], 16)
         elif c == "u" and source.istext:
             # unicode escape (exactly four digits)
             escape += source.getwhile(4, HEXDIGITS)
             if len(escape) != 6:
-                raise ValueError
+                raise source.error("incomplete escape %s" % escape, len(escape))
             return LITERAL, int(escape[2:], 16)
         elif c == "U" and source.istext:
             # unicode escape (exactly eight digits)
             escape += source.getwhile(8, HEXDIGITS)
             if len(escape) != 10:
-                raise ValueError
+                raise source.error("incomplete escape %s" % escape, len(escape))
             c = int(escape[2:], 16)
             chr(c) # raise ValueError for invalid code
             return LITERAL, c
@@ -339,7 +344,7 @@
             escape += source.getwhile(2, OCTDIGITS)
             c = int(escape[1:], 8)
             if c > 0o377:
-                raise source.error('octal escape value %r outside of '
+                raise source.error('octal escape value %s outside of '
                                    'range 0-0o377' % escape, len(escape))
             return LITERAL, c
         elif c in DIGITS:
@@ -352,7 +357,7 @@
             return LITERAL, ord(escape[1])
     except ValueError:
         pass
-    raise source.error("bogus escape: %r" % escape, len(escape))
+    raise source.error("bad escape %s" % escape, len(escape))
 
 def _escape(source, escape, state):
     # handle escape code in expression
@@ -368,19 +373,19 @@
             # hexadecimal escape
             escape += source.getwhile(2, HEXDIGITS)
             if len(escape) != 4:
-                raise ValueError
+                raise source.error("incomplete escape %s" % escape, len(escape))
             return LITERAL, int(escape[2:], 16)
         elif c == "u" and source.istext:
             # unicode escape (exactly four digits)
             escape += source.getwhile(4, HEXDIGITS)
             if len(escape) != 6:
-                raise ValueError
+                raise source.error("incomplete escape %s" % escape, len(escape))
             return LITERAL, int(escape[2:], 16)
         elif c == "U" and source.istext:
             # unicode escape (exactly eight digits)
             escape += source.getwhile(8, HEXDIGITS)
             if len(escape) != 10:
-                raise ValueError
+                raise source.error("incomplete escape %s" % escape, len(escape))
             c = int(escape[2:], 16)
             chr(c) # raise ValueError for invalid code
             return LITERAL, c
@@ -398,7 +403,7 @@
                     escape += source.get()
                     c = int(escape[1:], 8)
                     if c > 0o377:
-                        raise source.error('octal escape value %r outside of '
+                        raise source.error('octal escape value %s outside of '
                                            'range 0-0o377' % escape,
                                            len(escape))
                     return LITERAL, c
@@ -406,11 +411,11 @@
             group = int(escape[1:])
             if group < state.groups:
                 if not state.checkgroup(group):
-                    raise source.error("cannot refer to open group",
+                    raise source.error("cannot refer to an open group",
                                        len(escape))
                 state.checklookbehindgroup(group, source)
                 return GROUPREF, group
-            raise ValueError
+            raise source.error("invalid group reference", len(escape))
         if len(escape) == 2:
             if c in ASCIILETTERS:
                 import warnings
@@ -419,7 +424,7 @@
             return LITERAL, ord(escape[1])
     except ValueError:
         pass
-    raise source.error("bogus escape: %r" % escape, len(escape))
+    raise source.error("bad escape %s" % escape, len(escape))
 
 def _parse_sub(source, state, nested=True):
     # parse an alternation: a|b|c
@@ -427,12 +432,11 @@
     items = []
     itemsappend = items.append
     sourcematch = source.match
+    start = source.tell()
     while True:
         itemsappend(_parse(source, state))
         if not sourcematch("|"):
             break
-    if nested and source.next is not None and source.next != ")":
-        raise source.error("pattern not properly closed")
 
     if len(items) == 1:
         return items[0]
@@ -480,8 +484,6 @@
             raise source.error("conditional backref with more than two branches")
     else:
         item_no = None
-    if source.next is not None and source.next != ")":
-        raise source.error("pattern not properly closed")
     subpattern = SubPattern(state)
     subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
     return subpattern
@@ -526,6 +528,7 @@
             subpatternappend((LITERAL, _ord(this)))
 
         elif this == "[":
+            here = source.tell() - 1
             # character set
             set = []
             setappend = set.append
@@ -538,7 +541,8 @@
             while True:
                 this = sourceget()
                 if this is None:
-                    raise source.error("unexpected end of regular expression")
+                    raise source.error("unterminated character set",
+                                       source.tell() - here)
                 if this == "]" and set != start:
                     break
                 elif this[0] == "\\":
@@ -547,25 +551,28 @@
                     code1 = LITERAL, _ord(this)
                 if sourcematch("-"):
                     # potential range
-                    this = sourceget()
-                    if this is None:
-                        raise source.error("unexpected end of regular expression")
-                    if this == "]":
+                    that = sourceget()
+                    if that is None:
+                        raise source.error("unterminated character set",
+                                           source.tell() - here)
+                    if that == "]":
                         if code1[0] is IN:
                             code1 = code1[1][0]
                         setappend(code1)
                         setappend((LITERAL, _ord("-")))
                         break
-                    if this[0] == "\\":
-                        code2 = _class_escape(source, this)
+                    if that[0] == "\\":
+                        code2 = _class_escape(source, that)
                     else:
-                        code2 = LITERAL, _ord(this)
+                        code2 = LITERAL, _ord(that)
                     if code1[0] != LITERAL or code2[0] != LITERAL:
-                        raise source.error("bad character range", len(this))
+                        msg = "bad character range %s-%s" % (this, that)
+                        raise source.error(msg, len(this) + 1 + len(that))
                     lo = code1[1]
                     hi = code2[1]
                     if hi < lo:
-                        raise source.error("bad character range", len(this))
+                        msg = "bad character range %s-%s" % (this, that)
+                        raise source.error(msg, len(this) + 1 + len(that))
                     setappend((RANGE, (lo, hi)))
                 else:
                     if code1[0] is IN:
@@ -617,10 +624,10 @@
                     if max >= MAXREPEAT:
                         raise OverflowError("the repetition number is too large")
                     if max < min:
-                        raise source.error("bad repeat interval",
+                        raise source.error("min repeat greater than max repeat",
                                            source.tell() - here)
             else:
-                raise source.error("not supported", len(this))
+                raise AssertionError("unsupported quantifier %r" % (char,))
             # figure out which item to repeat
             if subpattern:
                 item = subpattern[-1:]
@@ -641,39 +648,32 @@
             subpatternappend((ANY, None))
 
         elif this == "(":
-            group = 1
+            start = source.tell() - 1
+            group = True
             name = None
             condgroup = None
             if sourcematch("?"):
-                group = 0
                 # options
                 char = sourceget()
                 if char is None:
-                    raise self.error("unexpected end of pattern")
+                    raise source.error("unexpected end of pattern")
                 if char == "P":
                     # python extensions
                     if sourcematch("<"):
                         # named group: skip forward to end of name
                         name = source.getuntil(">")
-                        group = 1
-                        if not name:
-                            raise source.error("missing group name", 1)
                         if not name.isidentifier():
-                            raise source.error("bad character in group name "
-                                               "%r" % name,
-                                               len(name) + 1)
+                            msg = "bad character in group name %r" % name
+                            raise source.error(msg, len(name) + 1)
                     elif sourcematch("="):
                         # named backreference
                         name = source.getuntil(")")
-                        if not name:
-                            raise source.error("missing group name", 1)
                         if not name.isidentifier():
-                            raise source.error("bad character in backref "
-                                               "group name %r" % name,
-                                               len(name) + 1)
+                            msg = "bad character in group name %r" % name
+                            raise source.error(msg, len(name) + 1)
                         gid = state.groupdict.get(name)
                         if gid is None:
-                            msg = "unknown group name: {0!r}".format(name)
+                            msg = "unknown group name %r" % name
                             raise source.error(msg, len(name) + 1)
                         state.checklookbehindgroup(gid, source)
                         subpatternappend((GROUPREF, gid))
@@ -682,16 +682,17 @@
                         char = sourceget()
                         if char is None:
                             raise source.error("unexpected end of pattern")
-                        raise source.error("unknown specifier: ?P%s" % char,
-                                           len(char))
+                        raise source.error("unknown extension ?P" + char,
+                                           len(char) + 2)
                 elif char == ":":
                     # non-capturing group
-                    group = 2
+                    group = None
                 elif char == "#":
                     # comment
                     while True:
                         if source.next is None:
-                            raise source.error("unbalanced parenthesis")
+                            raise source.error("missing ), unterminated comment",
+                                               source.tell() - start)
                         if sourceget() == ")":
                             break
                     continue
@@ -700,8 +701,11 @@
                     dir = 1
                     if char == "<":
                         char = sourceget()
-                        if char is None or char not in "=!":
-                            raise source.error("syntax error")
+                        if char is None:
+                            raise source.error("unexpected end of pattern")
+                        if char not in "=!":
+                            raise source.error("unknown extension ?<" + char,
+                                               len(char) + 2)
                         dir = -1 # lookbehind
                         lookbehindgroups = state.lookbehindgroups
                         if lookbehindgroups is None:
@@ -711,7 +715,8 @@
                         if lookbehindgroups is None:
                             state.lookbehindgroups = None
                     if not sourcematch(")"):
-                        raise source.error("unbalanced parenthesis")
+                        raise source.error("missing ), unterminated subpattern",
+                                           source.tell() - start)
                     if char == "=":
                         subpatternappend((ASSERT, (dir, p)))
                     else:
@@ -720,13 +725,11 @@
                 elif char == "(":
                     # conditional backreference group
                     condname = source.getuntil(")")
-                    group = 2
-                    if not condname:
-                        raise source.error("missing group name", 1)
+                    group = None
                     if condname.isidentifier():
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
-                            msg = "unknown group name: {0!r}".format(condname)
+                            msg = "unknown group name %r" % condname
                             raise source.error(msg, len(condname) + 1)
                     else:
                         try:
@@ -734,50 +737,48 @@
                             if condgroup < 0:
                                 raise ValueError
                         except ValueError:
-                            raise source.error("bad character in group name",
-                                               len(condname) + 1)
+                            msg = "bad character in group name %r" % condname
+                            raise source.error(msg, len(condname) + 1) from None
                         if not condgroup:
                             raise source.error("bad group number",
                                                len(condname) + 1)
                         if condgroup >= MAXGROUPS:
-                            raise source.error("the group number is too large",
+                            raise source.error("invalid group reference",
                                                len(condname) + 1)
                     state.checklookbehindgroup(condgroup, source)
                 elif char in FLAGS:
                     # flags
-                    state.flags |= FLAGS[char]
-                    while source.next in FLAGS:
-                        state.flags |= FLAGS[sourceget()]
+                    while True:
+                        state.flags |= FLAGS[char]
+                        char = sourceget()
+                        if char is None:
+                            raise source.error("missing )")
+                        if char == ")":
+                            break
+                        if char not in FLAGS:
+                            raise source.error("unknown flag", len(char))
                     verbose = state.flags & SRE_FLAG_VERBOSE
+                    continue
                 else:
-                    raise source.error("unexpected end of pattern")
-            if group:
-                # parse group contents
-                if group == 2:
-                    # anonymous group
-                    group = None
-                else:
-                    try:
-                        group = state.opengroup(name)
-                    except error as err:
-                        raise source.error(err.msg, len(name) + 1)
-                if condgroup:
-                    p = _parse_sub_cond(source, state, condgroup)
-                else:
-                    p = _parse_sub(source, state)
-                if not sourcematch(")"):
-                    raise source.error("unbalanced parenthesis")
-                if group is not None:
-                    state.closegroup(group, p)
-                subpatternappend((SUBPATTERN, (group, p)))
+                    raise source.error("unknown extension ?" + char,
+                                       len(char) + 1)
+
+            # parse group contents
+            if group is not None:
+                try:
+                    group = state.opengroup(name)
+                except error as err:
+                    raise source.error(err.msg, len(name) + 1) from None
+            if condgroup:
+                p = _parse_sub_cond(source, state, condgroup)
             else:
-                while True:
-                    char = sourceget()
-                    if char is None:
-                        raise source.error("unexpected end of pattern")
-                    if char == ")":
-                        break
-                    raise source.error("unknown extension", len(char))
+                p = _parse_sub(source, state)
+            if not source.match(")"):
+                raise source.error("missing ), unterminated subpattern",
+                                   source.tell() - start)
+            if group is not None:
+                state.closegroup(group, p)
+            subpatternappend((SUBPATTERN, (group, p)))
 
         elif this == "^":
             subpatternappend((AT, AT_BEGINNING))
@@ -786,7 +787,7 @@
             subpattern.append((AT, AT_END))
 
         else:
-            raise source.error("parser error", len(this))
+            raise AssertionError("unsupported special character %r" % (char,))
 
     return subpattern
 
@@ -804,7 +805,7 @@
             raise ValueError("ASCII and UNICODE flags are incompatible")
     else:
         if flags & SRE_FLAG_UNICODE:
-            raise ValueError("can't use UNICODE flag with a bytes pattern")
+            raise ValueError("cannot use UNICODE flag with a bytes pattern")
         if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
             import warnings
             warnings.warn("ASCII and LOCALE flags are incompatible. "
@@ -826,11 +827,8 @@
     p.pattern.flags = fix_flags(str, p.pattern.flags)
 
     if source.next is not None:
-        if source.next == ")":
-            raise source.error("unbalanced parenthesis")
-        else:
-            raise source.error("bogus characters at end of regular expression",
-                               len(tail))
+        assert source.next == ")"
+        raise source.error("unbalanced parenthesis")
 
     if flags & SRE_FLAG_DEBUG:
         p.dump()
@@ -866,26 +864,25 @@
             c = this[1]
             if c == "g":
                 name = ""
-                if s.match("<"):
-                    name = s.getuntil(">")
-                if not name:
-                    raise s.error("missing group name", 1)
-                try:
-                    index = int(name)
-                    if index < 0:
-                        raise s.error("negative group number", len(name) + 1)
-                    if index >= MAXGROUPS:
-                        raise s.error("the group number is too large",
-                                      len(name) + 1)
-                except ValueError:
-                    if not name.isidentifier():
-                        raise s.error("bad character in group name",
-                                      len(name) + 1)
+                if not s.match("<"):
+                    raise s.error("missing <")
+                name = s.getuntil(">")
+                if name.isidentifier():
                     try:
                         index = pattern.groupindex[name]
                     except KeyError:
-                        msg = "unknown group name: {0!r}".format(name)
-                        raise IndexError(msg)
+                        raise IndexError("unknown group name %r" % name)
+                else:
+                    try:
+                        index = int(name)
+                        if index < 0:
+                            raise ValueError
+                    except ValueError:
+                        raise s.error("bad character in group name %r" % name,
+                                      len(name) + 1) from None
+                    if index >= MAXGROUPS:
+                        raise s.error("invalid group reference",
+                                      len(name) + 1)
                 addgroup(index)
             elif c == "0":
                 if s.next in OCTDIGITS:
@@ -903,7 +900,7 @@
                         isoctal = True
                         c = int(this[1:], 8)
                         if c > 0o377:
-                            raise s.error('octal escape value %r outside of '
+                            raise s.error('octal escape value %s outside of '
                                           'range 0-0o377' % this, len(this))
                         lappend(chr(c))
                 if not isoctal:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -38,6 +38,24 @@
                 self.assertIs(type(actual), type(expect), msg)
         recurse(actual, expect)
 
+    def checkPatternError(self, pattern, errmsg, pos=None):
+        with self.assertRaises(re.error) as cm:
+            re.compile(pattern)
+        with self.subTest(pattern=pattern):
+            err = cm.exception
+            self.assertEqual(err.msg, errmsg)
+            if pos is not None:
+                self.assertEqual(err.pos, pos)
+
+    def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
+        with self.assertRaises(re.error) as cm:
+            re.sub(pattern, repl, string)
+        with self.subTest(pattern=pattern, repl=repl):
+            err = cm.exception
+            self.assertEqual(err.msg, errmsg)
+            if pos is not None:
+                self.assertEqual(err.pos, pos)
+
     def test_keep_buffer(self):
         # See bug 14212
         b = bytearray(b'x')
@@ -148,6 +166,7 @@
         self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
         self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
         self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
+        self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
 
         self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
         self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
@@ -158,21 +177,25 @@
         self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
         self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
 
-        self.assertRaises(re.error, re.sub, 'x', r'\400', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\777', 'x')
+        self.checkTemplateError('x', r'\400', 'x',
+                                r'octal escape value \400 outside of '
+                                r'range 0-0o377', 0)
+        self.checkTemplateError('x', r'\777', 'x',
+                                r'octal escape value \777 outside of '
+                                r'range 0-0o377', 0)
 
-        self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
-        self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
-        self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
-        self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
+        self.checkTemplateError('x', r'\1', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\8', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\9', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\11', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\18', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\90', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\99', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8'
+        self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference')
+        self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1'
+        self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0'
 
         # in python2.3 (etc), these loop endlessly in sre_parser.py
         self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
@@ -198,47 +221,65 @@
         re.compile('(?P<a>x)(?P=a)(?(a)y)')
         re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
         re.compile('(?P<a1>x)\1(?(1)y)')
-        self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
-        self.assertRaises(re.error, re.compile, '(?Px)')
-        self.assertRaises(re.error, re.compile, '(?P=)')
-        self.assertRaises(re.error, re.compile, '(?P=1)')
-        self.assertRaises(re.error, re.compile, '(?P=a)')
-        self.assertRaises(re.error, re.compile, '(?P=a1)')
-        self.assertRaises(re.error, re.compile, '(?P=a.)')
-        self.assertRaises(re.error, re.compile, '(?P<)')
-        self.assertRaises(re.error, re.compile, '(?P<>)')
-        self.assertRaises(re.error, re.compile, '(?P<1>)')
-        self.assertRaises(re.error, re.compile, '(?P<a.>)')
-        self.assertRaises(re.error, re.compile, '(?())')
-        self.assertRaises(re.error, re.compile, '(?(a))')
-        self.assertRaises(re.error, re.compile, '(?(1a))')
-        self.assertRaises(re.error, re.compile, '(?(a.))')
+        self.checkPatternError('(?P<a>)(?P<a>)',
+                               "redefinition of group name 'a' as group 2; "
+                               "was group 1")
+        self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
+        self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
+        self.checkPatternError('(?P=', 'missing group name', 4)
+        self.checkPatternError('(?P=)', 'missing group name', 4)
+        self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
+        self.checkPatternError('(?P=a)', "unknown group name 'a'")
+        self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
+        self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
+        self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
+        self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
+        self.checkPatternError('(?P<', 'missing group name', 4)
+        self.checkPatternError('(?P<>)', 'missing group name', 4)
+        self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
+        self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
+        self.checkPatternError(r'(?(', 'missing group name', 3)
+        self.checkPatternError(r'(?())', 'missing group name', 3)
+        self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
+        self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
+        self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
+        self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
         # New valid/invalid identifiers in Python 3
         re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
         re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
-        self.assertRaises(re.error, re.compile, '(?P<©>x)')
+        self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
         # Support > 100 groups.
         pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
         pat = '(?:%s)(?(200)z|t)' % pat
         self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
 
     def test_symbolic_refs(self):
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<2>', 'xx')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\2', 'xx')
-        self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
+        self.checkTemplateError('(?P<a>x)', '\g<a', 'xx',
+                                'missing >, unterminated name', 3)
+        self.checkTemplateError('(?P<a>x)', '\g<', 'xx',
+                                'missing group name', 3)
+        self.checkTemplateError('(?P<a>x)', '\g', 'xx', 'missing <', 2)
+        self.checkTemplateError('(?P<a>x)', '\g<a a>', 'xx',
+                                "bad character in group name 'a a'", 3)
+        self.checkTemplateError('(?P<a>x)', '\g<>', 'xx',
+                                'missing group name', 3)
+        self.checkTemplateError('(?P<a>x)', '\g<1a1>', 'xx',
+                                "bad character in group name '1a1'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
+                                'invalid group reference')
+        self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
+                                'invalid group reference')
+        with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
+            re.sub('(?P<a>x)', '\g<ab>', 'xx')
         self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
         self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
+        self.checkTemplateError('(?P<a>x)', '\g<-1>', 'xx',
+                                "bad character in group name '-1'", 3)
         # New valid/invalid identifiers in Python 3
         self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
         self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
-        self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
+        self.checkTemplateError('(?P<a>x)', '\g<©>', 'xx',
+                                "bad character in group name '©'", 3)
         # Support > 100 groups.
         pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
         self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
@@ -444,6 +485,19 @@
         pat = '(?:%s)(?(200)z)' % pat
         self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
 
+        self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
+        self.checkPatternError(r'()(?(1)a|b',
+                               'missing ), unterminated subpattern', 2)
+        self.checkPatternError(r'()(?(1)a|b|c)',
+                               'conditional backref with more than '
+                               'two branches', 10)
+
+    def test_re_groupref_overflow(self):
+        self.checkTemplateError('()', '\g<%s>' % sre_constants.MAXGROUPS, 'xx',
+                                'invalid group reference', 3)
+        self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
+                               'invalid group reference', 10)
+
     def test_re_groupref(self):
         self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
                          ('|', 'a'))
@@ -456,6 +510,8 @@
         self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
                          (None, None))
 
+        self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
+
     def test_groupdict(self):
         self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
                                   'first second').groupdict(),
@@ -493,6 +549,7 @@
 
         self.assertTrue(re.match("^x{3}$", "xxx"))
         self.assertTrue(re.match("^x{1,3}$", "xxx"))
+        self.assertTrue(re.match("^x{3,3}$", "xxx"))
         self.assertTrue(re.match("^x{1,4}$", "xxx"))
         self.assertTrue(re.match("^x{3,4}?$", "xxx"))
         self.assertTrue(re.match("^x{3}?$", "xxx"))
@@ -503,6 +560,9 @@
         self.assertIsNone(re.match("^x{}$", "xxx"))
         self.assertTrue(re.match("^x{}$", "x{}"))
 
+        self.checkPatternError(r'x{2,1}',
+                               'min repeat greater than max repeat', 2)
+
     def test_getattr(self):
         self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
         self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
@@ -550,7 +610,7 @@
                                    b"1aa! a", re.LOCALE).group(0), b"1aa! a")
 
     def test_other_escapes(self):
-        self.assertRaises(re.error, re.compile, "\\")
+        self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
         self.assertEqual(re.match(r"\(", '(').group(), '(')
         self.assertIsNone(re.match(r"\(", ')'))
         self.assertEqual(re.match(r"\\", '\\').group(), '\\')
@@ -875,15 +935,17 @@
         self.assertTrue(re.match(r"\08", "\0008"))
         self.assertTrue(re.match(r"\01", "\001"))
         self.assertTrue(re.match(r"\018", "\0018"))
-        self.assertRaises(re.error, re.match, r"\567", "")
-        self.assertRaises(re.error, re.match, r"\911", "")
-        self.assertRaises(re.error, re.match, r"\x1", "")
-        self.assertRaises(re.error, re.match, r"\x1z", "")
-        self.assertRaises(re.error, re.match, r"\u123", "")
-        self.assertRaises(re.error, re.match, r"\u123z", "")
-        self.assertRaises(re.error, re.match, r"\U0001234", "")
-        self.assertRaises(re.error, re.match, r"\U0001234z", "")
-        self.assertRaises(re.error, re.match, r"\U00110000", "")
+        self.checkPatternError(r"\567",
+                               r'octal escape value \567 outside of '
+                               r'range 0-0o377', 0)
+        self.checkPatternError(r"\911", 'invalid group reference', 0)
+        self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
+        self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
+        self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
+        self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
+        self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
+        self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
+        self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
 
     def test_sre_character_class_literals(self):
         for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
@@ -903,12 +965,14 @@
             self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
             self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
             self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
-        self.assertRaises(re.error, re.match, r"[\567]", "")
-        self.assertRaises(re.error, re.match, r"[\911]", "")
-        self.assertRaises(re.error, re.match, r"[\x1z]", "")
-        self.assertRaises(re.error, re.match, r"[\u123z]", "")
-        self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
-        self.assertRaises(re.error, re.match, r"[\U00110000]", "")
+        self.checkPatternError(r"[\567]",
+                               r'octal escape value \567 outside of '
+                               r'range 0-0o377', 1)
+        self.checkPatternError(r"[\911]", r'bad escape \9', 1)
+        self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
+        self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
+        self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
+        self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
         self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
 
     def test_sre_byte_literals(self):
@@ -927,10 +991,12 @@
         self.assertTrue(re.match(br"\08", b"\0008"))
         self.assertTrue(re.match(br"\01", b"\001"))
         self.assertTrue(re.match(br"\018", b"\0018"))
-        self.assertRaises(re.error, re.match, br"\567", b"")
-        self.assertRaises(re.error, re.match, br"\911", b"")
-        self.assertRaises(re.error, re.match, br"\x1", b"")
-        self.assertRaises(re.error, re.match, br"\x1z", b"")
+        self.checkPatternError(br"\567",
+                               r'octal escape value \567 outside of '
+                               r'range 0-0o377', 0)
+        self.checkPatternError(br"\911", 'invalid group reference', 0)
+        self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
+        self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
 
     def test_sre_byte_class_literals(self):
         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
@@ -946,9 +1012,22 @@
             self.assertTrue(re.match(br"[\u1234]", b'u'))
         with self.assertWarns(DeprecationWarning):
             self.assertTrue(re.match(br"[\U00012345]", b'U'))
-        self.assertRaises(re.error, re.match, br"[\567]", b"")
-        self.assertRaises(re.error, re.match, br"[\911]", b"")
-        self.assertRaises(re.error, re.match, br"[\x1z]", b"")
+        self.checkPatternError(br"[\567]",
+                               r'octal escape value \567 outside of '
+                               r'range 0-0o377', 1)
+        self.checkPatternError(br"[\911]", r'bad escape \9', 1)
+        self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
+
+    def test_character_set_errors(self):
+        self.checkPatternError(r'[', 'unterminated character set', 0)
+        self.checkPatternError(r'[^', 'unterminated character set', 0)
+        self.checkPatternError(r'[a', 'unterminated character set', 0)
+        # bug 545855 -- This pattern failed to cause a compile error as it
+        # should, instead provoking a TypeError.
+        self.checkPatternError(r"[a-", 'unterminated character set', 0)
+        self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
+        self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
+        self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
 
     def test_bug_113254(self):
         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
@@ -963,11 +1042,6 @@
         self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
         self.assertEqual(re.match("((a))", "a").lastindex, 1)
 
-    def test_bug_545855(self):
-        # bug 545855 -- This pattern failed to cause a compile error as it
-        # should, instead provoking a TypeError.
-        self.assertRaises(re.error, re.compile, 'foo[a-')
-
     def test_bug_418626(self):
         # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
         # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
@@ -991,6 +1065,24 @@
         self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
         self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
 
+    def test_nothing_to_repeat(self):
+        for reps in '*', '+', '?', '{1,2}':
+            for mod in '', '?':
+                self.checkPatternError('%s%s' % (reps, mod),
+                                       'nothing to repeat', 0)
+                self.checkPatternError('(?:%s%s)' % (reps, mod),
+                                       'nothing to repeat', 3)
+
+    def test_multiple_repeat(self):
+        for outer_reps in '*', '+', '{1,2}':
+            for outer_mod in '', '?':
+                outer_op = outer_reps + outer_mod
+                for inner_reps in '*', '+', '?', '{1,2}':
+                    for inner_mod in '', '?':
+                        inner_op = inner_reps + inner_mod
+                        self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
+                                'multiple repeat', 1 + len(inner_op))
+
     def test_unlimited_zero_width_repeat(self):
         # Issue #9669
         self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
@@ -1381,13 +1473,13 @@
 
     def test_backref_group_name_in_exception(self):
         # Issue 17341: Poor error message when compiling invalid regex
-        with self.assertRaisesRegex(sre_constants.error, '<foo>'):
-            re.compile('(?P=<foo>)')
+        self.checkPatternError('(?P=<foo>)',
+                               "bad character in group name '<foo>'", 4)
 
     def test_group_name_in_exception(self):
         # Issue 17341: Poor error message when compiling invalid regex
-        with self.assertRaisesRegex(sre_constants.error, '\?foo'):
-            re.compile('(?P<?foo>)')
+        self.checkPatternError('(?P<?foo>)',
+                               "bad character in group name '?foo'", 4)
 
     def test_issue17998(self):
         for reps in '*', '+', '?', '{1}':
@@ -1556,6 +1648,19 @@
         self.assertIn(' at position 77', str(err))
         self.assertIn('(line 5, column 17)', str(err))
 
+    def test_misc_errors(self):
+        self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
+        self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
+        self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
+        self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
+        self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
+        self.checkPatternError(r'(?iz)', 'unknown flag', 3)
+        self.checkPatternError(r'(?i', 'missing )', 3)
+        self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
+        self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
+        self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
+        self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
+
 
 class PatternReprTests(unittest.TestCase):
     def check(self, pattern, expected):
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -30,6 +30,8 @@
 Library
 -------
 
+- Issue #22364: Improved some re error messages using regex for hints.
+
 - Issue #23742: ntpath.expandvars() no longer loses unbalanced single quotes.
 
 - Issue #21717: The zipfile.ZipFile.open function now supports 'x' (exclusive
diff --git a/Modules/_sre.c b/Modules/_sre.c
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -315,7 +315,7 @@
 
     /* get pointer to byte string buffer */
     if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
-        PyErr_SetString(PyExc_TypeError, "expected string or buffer");
+        PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
         return NULL;
     }
 
@@ -359,12 +359,12 @@
 
     if (isbytes && pattern->isbytes == 0) {
         PyErr_SetString(PyExc_TypeError,
-                        "can't use a string pattern on a bytes-like object");
+                        "cannot use a string pattern on a bytes-like object");
         goto err;
     }
     if (!isbytes && pattern->isbytes > 0) {
         PyErr_SetString(PyExc_TypeError,
-                        "can't use a bytes pattern on a string-like object");
+                        "cannot use a bytes pattern on a string-like object");
         goto err;
     }
 

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list