[3.12] GH-105113: Improve performance of `pathlib.PurePath.match()` (GH-105114)
https://github.com/python/cpython/commit/e7cb216339e8aa8d00f7f55f9e37864b172... commit: e7cb216339e8aa8d00f7f55f9e37864b1720ca0f branch: 3.12 author: Barney Gale <barney.gale@gmail.com> committer: barneygale <barney.gale@gmail.com> date: 2023-05-31T21:37:37+01:00 summary: [3.12] GH-105113: Improve performance of `pathlib.PurePath.match()` (GH-105114) We now compile a `re.Pattern` object for the entire pattern. This is made more difficult by `fnmatch` not treating directory separators as special when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts onto separate *lines* in a string, and ensure we don't set `re.DOTALL`. Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com> Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com> files: A Misc/NEWS.d/next/Library/2023-05-30-21-27-41.gh-issue-105113.bDUPl_.rst M Doc/library/pathlib.rst M Lib/pathlib.py diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 627f2df9263d..39b9eb84e7e0 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -569,6 +569,13 @@ Pure paths provide the following methods and properties: >>> PurePath('a/b.py').match('/*.py') False + The *pattern* may be another path object; this speeds up matching the same + pattern against multiple files:: + + >>> pattern = PurePath('*.py') + >>> PurePath('a/b.py').match(pattern) + True + As with other methods, case-sensitivity follows platform defaults:: >>> PurePosixPath('b.py').match('*.PY') diff --git a/Lib/pathlib.py b/Lib/pathlib.py index a42085e3a3f8..29517e4c74db 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,6 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +@functools.cache def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa' @@ -61,6 +62,22 @@ def _is_case_sensitive(flavour): # Globbing helpers # + +# fnmatch.translate() returns a regular expression that includes a prefix and +# a suffix, which enable matching newlines and ensure the end of the string is +# matched, respectively. These features are undesirable for our implementation +# of PurePatch.match(), which represents path separators as newlines and joins +# pattern segments together. As a workaround, we define a slice object that +# can remove the prefix and suffix from any translate() result. See the +# _compile_pattern_lines() function for more details. +_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') +_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) +_SWAP_SEP_AND_NEWLINE = { + '/': str.maketrans({'/': '\n', '\n': '/'}), + '\\': str.maketrans({'\\': '\n', '\n': '\\'}), +} + + @functools.lru_cache() def _make_selector(pattern_parts, flavour, case_sensitive): pat = pattern_parts[0] @@ -92,6 +109,38 @@ def _compile_pattern(pat, case_sensitive): return re.compile(fnmatch.translate(pat), flags).match +@functools.lru_cache() +def _compile_pattern_lines(pattern_lines, case_sensitive): + """Compile the given pattern lines to an `re.Pattern` object. + + The *pattern_lines* argument is a glob-style pattern (e.g. '*/*.py') with + its path separators and newlines swapped (e.g. '*\n*.py`). By using + newlines to separate path components, and not setting `re.DOTALL`, we + ensure that the `*` wildcard cannot match path separators. + + The returned `re.Pattern` object may have its `match()` method called to + match a complete pattern, or `search()` to match from the right. The + argument supplied to these methods must also have its path separators and + newlines swapped. + """ + + # Match the start of the path, or just after a path separator + parts = ['^'] + for part in pattern_lines.splitlines(keepends=True): + # We slice off the common prefix and suffix added by translate() to + # ensure that re.DOTALL is not set, and the end of the string not + # matched, respectively. With DOTALL not set, '*' wildcards will not + # match path separators, because the '.' characters in the pattern + # will not match newlines. + parts.append(fnmatch.translate(part)[_FNMATCH_SLICE]) + # Match the end of the path, always. + parts.append(r'\Z') + flags = re.MULTILINE + if not case_sensitive: + flags |= re.IGNORECASE + return re.compile(''.join(parts), flags=flags) + + class _Selector: """A selector matches a specific glob pattern part against the children of a given path.""" @@ -274,6 +323,10 @@ class PurePath(object): # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', + # The `_lines_cached` slot stores the string path with path separators + # and newlines swapped. This is used to implement `match()`. + '_lines_cached', + # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. '_hash', @@ -439,6 +492,16 @@ def _parts_normcase(self): self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) return self._parts_normcase_cached + @property + def _lines(self): + # Path with separators and newlines swapped, for pattern matching. + try: + return self._lines_cached + except AttributeError: + trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] + self._lines_cached = str(self).translate(trans) + return self._lines_cached + def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented @@ -695,23 +758,18 @@ def match(self, path_pattern, *, case_sensitive=None): """ Return True if this path matches the given pattern. """ + if not isinstance(path_pattern, PurePath): + path_pattern = self.with_segments(path_pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self._flavour) - pat = self.with_segments(path_pattern) - if not pat.parts: + pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive) + if path_pattern.drive or path_pattern.root: + return pattern.match(self._lines) is not None + elif path_pattern._tail: + return pattern.search(self._lines) is not None + else: raise ValueError("empty pattern") - pat_parts = pat.parts - parts = self.parts - if pat.drive or pat.root: - if len(pat_parts) != len(parts): - return False - elif len(pat_parts) > len(parts): - return False - for part, pat in zip(reversed(parts), reversed(pat_parts)): - match = _compile_pattern(pat, case_sensitive) - if not match(part): - return False - return True + # Can't subclass os.PathLike from PurePath and keep the constructor # optimizations in PurePath.__slots__. diff --git a/Misc/NEWS.d/next/Library/2023-05-30-21-27-41.gh-issue-105113.bDUPl_.rst b/Misc/NEWS.d/next/Library/2023-05-30-21-27-41.gh-issue-105113.bDUPl_.rst new file mode 100644 index 000000000000..59164ae4734e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-05-30-21-27-41.gh-issue-105113.bDUPl_.rst @@ -0,0 +1,2 @@ +Improve performance of :meth:`pathlib.PurePath.match` by compiling an +:class:`re.Pattern` object for the entire pattern.
participants (1)
-
barneygale