[Python-checkins] bpo-40480 "fnmatch" exponential execution time (GH-19908)

Tim Peters webhook-mailer at python.org
Tue May 5 22:28:32 EDT 2020


https://github.com/python/cpython/commit/b9c46a2c2d7fc68457bff641f78932d66f5e5f59
commit: b9c46a2c2d7fc68457bff641f78932d66f5e5f59
branch: master
author: Tim Peters <tim.peters at gmail.com>
committer: GitHub <noreply at github.com>
date: 2020-05-05T21:28:24-05:00
summary:

bpo-40480 "fnmatch" exponential execution time (GH-19908)

bpo-40480:  create different regexps in the presence of multiple `*`
patterns to prevent fnmatch() from taking exponential time.

files:
A Misc/NEWS.d/next/Library/2020-05-04-21-21-43.bpo-40480.mjldWa.rst
M Lib/fnmatch.py
M Lib/test/test_fnmatch.py

diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py
index b98e6413295e1..d7d915d51314d 100644
--- a/Lib/fnmatch.py
+++ b/Lib/fnmatch.py
@@ -77,15 +77,19 @@ def translate(pat):
     There is no way to quote meta-characters.
     """
 
+    STAR = object()
+    res = []
+    add = res.append
     i, n = 0, len(pat)
-    res = ''
     while i < n:
         c = pat[i]
         i = i+1
         if c == '*':
-            res = res + '.*'
+            # compress consecutive `*` into one
+            if (not res) or res[-1] is not STAR:
+                add(STAR)
         elif c == '?':
-            res = res + '.'
+            add('.')
         elif c == '[':
             j = i
             if j < n and pat[j] == '!':
@@ -95,7 +99,7 @@ def translate(pat):
             while j < n and pat[j] != ']':
                 j = j+1
             if j >= n:
-                res = res + '\\['
+                add('\\[')
             else:
                 stuff = pat[i:j]
                 if '--' not in stuff:
@@ -122,7 +126,49 @@ def translate(pat):
                     stuff = '^' + stuff[1:]
                 elif stuff[0] in ('^', '['):
                     stuff = '\\' + stuff
-                res = '%s[%s]' % (res, stuff)
+                add(f'[{stuff}]')
         else:
-            res = res + re.escape(c)
-    return r'(?s:%s)\Z' % res
+            add(re.escape(c))
+    assert i == n
+
+    # Deal with STARs.
+    inp = res
+    res = []
+    add = res.append
+    i, n = 0, len(inp)
+    # Fixed pieces at the start?
+    while i < n and inp[i] is not STAR:
+        add(inp[i])
+        i += 1
+    # Now deal with STAR fixed STAR fixed ...
+    # For an interior `STAR fixed` pairing, we want to do a minimal
+    # .*? match followed by `fixed`, with no possibility of backtracking.
+    # We can't spell that directly, but can trick it into working by matching
+    #    .*?fixed
+    # in a lookahead assertion, save the matched part in a group, then
+    # consume that group via a backreference. If the overall match fails,
+    # the lookahead assertion won't try alternatives. So the translation is:
+    #     (?=(?P<name>.*?fixed))(?P=name)
+    # Group names are created as needed: g1, g2, g3, ...
+    groupnum = 0
+    while i < n:
+        assert inp[i] is STAR
+        i += 1
+        if i == n:
+            add(".*")
+            break
+        assert inp[i] is not STAR
+        fixed = []
+        while i < n and inp[i] is not STAR:
+            fixed.append(inp[i])
+            i += 1
+        fixed = "".join(fixed)
+        if i == n:
+            add(".*")
+            add(fixed)
+        else:
+            groupnum += 1
+            add(f"(?=(?P<g{groupnum}>.*?{fixed}))(?P=g{groupnum})")
+    assert i == n
+    res = "".join(res)
+    return fr'(?s:{res})\Z'
diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py
index 55f9f0d3a5425..4c173069503cc 100644
--- a/Lib/test/test_fnmatch.py
+++ b/Lib/test/test_fnmatch.py
@@ -45,6 +45,13 @@ def test_fnmatch(self):
         check('\nfoo', 'foo*', False)
         check('\n', '*')
 
+    def test_slow_fnmatch(self):
+        check = self.check_match
+        check('a' * 50, '*a*a*a*a*a*a*a*a*a*a')
+        # The next "takes forever" if the regexp translation is
+        # straightforward.  See bpo-40480.
+        check('a' * 50 + 'b', '*a*a*a*a*a*a*a*a*a*a', False)
+
     def test_mix_bytes_str(self):
         self.assertRaises(TypeError, fnmatch, 'test', b'*')
         self.assertRaises(TypeError, fnmatch, b'test', '*')
@@ -107,6 +114,16 @@ def test_translate(self):
         self.assertEqual(translate('[!x]'), r'(?s:[^x])\Z')
         self.assertEqual(translate('[^x]'), r'(?s:[\^x])\Z')
         self.assertEqual(translate('[x'), r'(?s:\[x)\Z')
+        # from the docs
+        self.assertEqual(translate('*.txt'), r'(?s:.*\.txt)\Z')
+        # squash consecutive stars
+        self.assertEqual(translate('*********'), r'(?s:.*)\Z')
+        self.assertEqual(translate('A*********'), r'(?s:A.*)\Z')
+        self.assertEqual(translate('*********A'), r'(?s:.*A)\Z')
+        self.assertEqual(translate('A*********?[?]?'), r'(?s:A.*.[?].)\Z')
+        # fancy translation to prevent exponential-time match failure
+        self.assertEqual(translate('**a*a****a'),
+             r'(?s:(?=(?P<g1>.*?a))(?P=g1)(?=(?P<g2>.*?a))(?P=g2).*a)\Z')
 
 
 class FilterTestCase(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2020-05-04-21-21-43.bpo-40480.mjldWa.rst b/Misc/NEWS.d/next/Library/2020-05-04-21-21-43.bpo-40480.mjldWa.rst
new file mode 100644
index 0000000000000..d046b1422419d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-05-04-21-21-43.bpo-40480.mjldWa.rst
@@ -0,0 +1 @@
+``fnmatch.fnmatch()`` could take exponential time in the presence of multiple ``*`` pattern characters.  This was repaired by generating more elaborate regular expressions to avoid futile backtracking.
\ No newline at end of file



More information about the Python-checkins mailing list