bpo-39595: Improve zipfile.Path performance (GH-18406) (GH-18472)
https://github.com/python/cpython/commit/ed4d263e8767b0e4c47df99141b500d36ce... commit: ed4d263e8767b0e4c47df99141b500d36ce0275d branch: 3.8 author: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> committer: GitHub <noreply@github.com> date: 2020-02-11T22:21:32-05:00 summary: bpo-39595: Improve zipfile.Path performance (GH-18406) (GH-18472) * Improve zipfile.Path performance on zipfiles with a large number of entries. * 📜🤖 Added by blurb_it. * Add bpo to blurb * Sync with importlib_metadata 1.5 (6fe70ca) * Update blurb. * Remove compatibility code * Add stubs module, omitted from earlier commit Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> (cherry picked from commit e5bd73632e77dc5ab0cab77e48e94ca5e354be8a) Co-authored-by: Jason R. Coombs <jaraco@jaraco.com> Co-authored-by: Jason R. Coombs <jaraco@jaraco.com> files: A Lib/test/test_importlib/stubs.py A Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst M Lib/importlib/metadata.py M Lib/test/test_importlib/fixtures.py M Lib/test/test_importlib/test_main.py M Lib/test/test_zipfile.py M Lib/zipfile.py diff --git a/Lib/importlib/metadata.py b/Lib/importlib/metadata.py index ae8ecf9b8500c..831f593277ccd 100644 --- a/Lib/importlib/metadata.py +++ b/Lib/importlib/metadata.py @@ -391,6 +391,7 @@ class FastPath: def __init__(self, root): self.root = root + self.base = os.path.basename(root).lower() def joinpath(self, child): return pathlib.Path(self.root, child) @@ -413,12 +414,11 @@ def zip_children(self): ) def is_egg(self, search): - root_n_low = os.path.split(self.root)[1].lower() - + base = self.base return ( - root_n_low == search.normalized + '.egg' - or root_n_low.startswith(search.prefix) - and root_n_low.endswith('.egg')) + base == search.versionless_egg_name + or base.startswith(search.prefix) + and base.endswith('.egg')) def search(self, name): for child in self.children(): @@ -439,6 +439,7 @@ class Prepared: prefix = '' suffixes = '.dist-info', '.egg-info' exact_matches = [''][:0] + versionless_egg_name = '' def __init__(self, name): self.name = name @@ -448,6 +449,7 @@ def __init__(self, name): self.prefix = self.normalized + '-' self.exact_matches = [ self.normalized + suffix for suffix in self.suffixes] + self.versionless_egg_name = self.normalized + '.egg' class MetadataPathFinder(DistributionFinder): diff --git a/Lib/test/test_importlib/fixtures.py b/Lib/test/test_importlib/fixtures.py index 0b4ce18d5a6cd..695c92a786cb0 100644 --- a/Lib/test/test_importlib/fixtures.py +++ b/Lib/test/test_importlib/fixtures.py @@ -47,14 +47,28 @@ def tempdir_as_cwd(): yield tmp -class SiteDir: +@contextlib.contextmanager +def install_finder(finder): + sys.meta_path.append(finder) + try: + yield + finally: + sys.meta_path.remove(finder) + + +class Fixtures: def setUp(self): self.fixtures = ExitStack() self.addCleanup(self.fixtures.close) + + +class SiteDir(Fixtures): + def setUp(self): + super(SiteDir, self).setUp() self.site_dir = self.fixtures.enter_context(tempdir()) -class OnSysPath: +class OnSysPath(Fixtures): @staticmethod @contextlib.contextmanager def add_sys_path(dir): @@ -198,3 +212,8 @@ def build_files(file_defs, prefix=pathlib.Path()): def DALS(str): "Dedent and left-strip" return textwrap.dedent(str).lstrip() + + +class NullFinder: + def find_module(self, name): + pass diff --git a/Lib/test/test_importlib/stubs.py b/Lib/test/test_importlib/stubs.py new file mode 100644 index 0000000000000..e5b011c399fa9 --- /dev/null +++ b/Lib/test/test_importlib/stubs.py @@ -0,0 +1,10 @@ +import unittest + + +class fake_filesystem_unittest: + """ + Stubbed version of the pyfakefs module + """ + class TestCase(unittest.TestCase): + def setUpPyfakefs(self): + self.skipTest("pyfakefs not available") diff --git a/Lib/test/test_importlib/test_main.py b/Lib/test/test_importlib/test_main.py index c5f1dbbae325e..42a79992ecc8c 100644 --- a/Lib/test/test_importlib/test_main.py +++ b/Lib/test/test_importlib/test_main.py @@ -7,6 +7,11 @@ import unittest import importlib.metadata +try: + import pyfakefs.fake_filesystem_unittest as ffs +except ImportError: + from .stubs import fake_filesystem_unittest as ffs + from . import fixtures from importlib.metadata import ( Distribution, EntryPoint, @@ -185,6 +190,33 @@ def test_egg(self): version('foo') +class MissingSysPath(fixtures.OnSysPath, unittest.TestCase): + site_dir = '/does-not-exist' + + def test_discovery(self): + """ + Discovering distributions should succeed even if + there is an invalid path on sys.path. + """ + importlib.metadata.distributions() + + +class InaccessibleSysPath(fixtures.OnSysPath, ffs.TestCase): + site_dir = '/access-denied' + + def setUp(self): + super(InaccessibleSysPath, self).setUp() + self.setUpPyfakefs() + self.fs.create_dir(self.site_dir, perm_bits=000) + + def test_discovery(self): + """ + Discovering distributions should succeed even if + there is an invalid path on sys.path. + """ + list(importlib.metadata.distributions()) + + class TestEntryPoints(unittest.TestCase): def __init__(self, *args): super(TestEntryPoints, self).__init__(*args) diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index c65de9202c0c4..61bca8651c02a 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -2683,16 +2683,71 @@ def test_extract_command(self): self.assertEqual(f.read(), zf.read(zi)) +class TestExecutablePrependedZip(unittest.TestCase): + """Test our ability to open zip files with an executable prepended.""" + + def setUp(self): + self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata') + self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata') + + def _test_zip_works(self, name): + # bpo28494 sanity check: ensure is_zipfile works on these. + self.assertTrue(zipfile.is_zipfile(name), + f'is_zipfile failed on {name}') + # Ensure we can operate on these via ZipFile. + with zipfile.ZipFile(name) as zipfp: + for n in zipfp.namelist(): + data = zipfp.read(n) + self.assertIn(b'FAVORITE_NUMBER', data) + + def test_read_zip_with_exe_prepended(self): + self._test_zip_works(self.exe_zip) + + def test_read_zip64_with_exe_prepended(self): + self._test_zip_works(self.exe_zip64) + + @unittest.skipUnless(sys.executable, 'sys.executable required.') + @unittest.skipUnless(os.access('/bin/bash', os.X_OK), + 'Test relies on #!/bin/bash working.') + def test_execute_zip2(self): + output = subprocess.check_output([self.exe_zip, sys.executable]) + self.assertIn(b'number in executable: 5', output) + + @unittest.skipUnless(sys.executable, 'sys.executable required.') + @unittest.skipUnless(os.access('/bin/bash', os.X_OK), + 'Test relies on #!/bin/bash working.') + def test_execute_zip64(self): + output = subprocess.check_output([self.exe_zip64, sys.executable]) + self.assertIn(b'number in executable: 5', output) + + # Poor man's technique to consume a (smallish) iterable. consume = tuple +# from jaraco.itertools 5.0 +class jaraco: + class itertools: + class Counter: + def __init__(self, i): + self.count = 0 + self._orig_iter = iter(i) + + def __iter__(self): + return self + + def __next__(self): + result = next(self._orig_iter) + self.count += 1 + return result + + def add_dirs(zf): """ Given a writable zip file zf, inject directory entries for any directories implied by the presence of children. """ - for name in zipfile.Path._implied_dirs(zf.namelist()): + for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()): zf.writestr(name, b"") return zf @@ -2733,44 +2788,6 @@ def build_alpharep_fixture(): return zf -class TestExecutablePrependedZip(unittest.TestCase): - """Test our ability to open zip files with an executable prepended.""" - - def setUp(self): - self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata') - self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata') - - def _test_zip_works(self, name): - # bpo-28494 sanity check: ensure is_zipfile works on these. - self.assertTrue(zipfile.is_zipfile(name), - f'is_zipfile failed on {name}') - # Ensure we can operate on these via ZipFile. - with zipfile.ZipFile(name) as zipfp: - for n in zipfp.namelist(): - data = zipfp.read(n) - self.assertIn(b'FAVORITE_NUMBER', data) - - def test_read_zip_with_exe_prepended(self): - self._test_zip_works(self.exe_zip) - - def test_read_zip64_with_exe_prepended(self): - self._test_zip_works(self.exe_zip64) - - @unittest.skipUnless(sys.executable, 'sys.executable required.') - @unittest.skipUnless(os.access('/bin/bash', os.X_OK), - 'Test relies on #!/bin/bash working.') - def test_execute_zip2(self): - output = subprocess.check_output([self.exe_zip, sys.executable]) - self.assertIn(b'number in executable: 5', output) - - @unittest.skipUnless(sys.executable, 'sys.executable required.') - @unittest.skipUnless(os.access('/bin/bash', os.X_OK), - 'Test relies on #!/bin/bash working.') - def test_execute_zip64(self): - output = subprocess.check_output([self.exe_zip64, sys.executable]) - self.assertIn(b'number in executable: 5', output) - - class TestPath(unittest.TestCase): def setUp(self): self.fixtures = contextlib.ExitStack() @@ -2808,6 +2825,14 @@ def test_iterdir_and_types(self): i, = h.iterdir() assert i.is_file() + def test_subdir_is_dir(self): + for alpharep in self.zipfile_alpharep(): + root = zipfile.Path(alpharep) + assert (root / 'b').is_dir() + assert (root / 'b/').is_dir() + assert (root / 'g').is_dir() + assert (root / 'g/').is_dir() + def test_open(self): for alpharep in self.zipfile_alpharep(): root = zipfile.Path(alpharep) @@ -2869,6 +2894,45 @@ def test_missing_dir_parent(self): root = zipfile.Path(alpharep) assert (root / 'missing dir/').parent.at == '' + def test_mutability(self): + """ + If the underlying zipfile is changed, the Path object should + reflect that change. + """ + for alpharep in self.zipfile_alpharep(): + root = zipfile.Path(alpharep) + a, b, g = root.iterdir() + alpharep.writestr('foo.txt', 'foo') + alpharep.writestr('bar/baz.txt', 'baz') + assert any( + child.name == 'foo.txt' + for child in root.iterdir()) + assert (root / 'foo.txt').read_text() == 'foo' + baz, = (root / 'bar').iterdir() + assert baz.read_text() == 'baz' + + HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13 + + def huge_zipfile(self): + """Create a read-only zipfile with a huge number of entries entries.""" + strm = io.BytesIO() + zf = zipfile.ZipFile(strm, "w") + for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)): + zf.writestr(entry, entry) + zf.mode = 'r' + return zf + + def test_joinpath_constant_time(self): + """ + Ensure joinpath on items in zipfile is linear time. + """ + root = zipfile.Path(self.huge_zipfile()) + entries = jaraco.itertools.Counter(root.iterdir()) + for entry in entries: + entry.joinpath('suffix') + # Check the file iterated all items + assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES + if __name__ == "__main__": unittest.main() diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 8b99c1189baa8..5dc6516cc47b7 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -16,6 +16,8 @@ import sys import threading import time +import contextlib +from collections import OrderedDict try: import zlib # We may need its compression method @@ -2182,6 +2184,79 @@ def _ancestry(path): path, tail = posixpath.split(path) +class CompleteDirs(ZipFile): + """ + A ZipFile subclass that ensures that implied directories + are always included in the namelist. + """ + + @staticmethod + def _implied_dirs(names): + parents = itertools.chain.from_iterable(map(_parents, names)) + # Deduplicate entries in original order + implied_dirs = OrderedDict.fromkeys( + p + posixpath.sep for p in parents + # Cast names to a set for O(1) lookups + if p + posixpath.sep not in set(names) + ) + return implied_dirs + + def namelist(self): + names = super(CompleteDirs, self).namelist() + return names + list(self._implied_dirs(names)) + + def _name_set(self): + return set(self.namelist()) + + def resolve_dir(self, name): + """ + If the name represents a directory, return that name + as a directory (with the trailing slash). + """ + names = self._name_set() + dirname = name + '/' + dir_match = name not in names and dirname in names + return dirname if dir_match else name + + @classmethod + def make(cls, source): + """ + Given a source (filename or zipfile), return an + appropriate CompleteDirs subclass. + """ + if isinstance(source, CompleteDirs): + return source + + if not isinstance(source, ZipFile): + return cls(source) + + # Only allow for FastPath when supplied zipfile is read-only + if 'r' not in source.mode: + cls = CompleteDirs + + res = cls.__new__(cls) + vars(res).update(vars(source)) + return res + + +class FastLookup(CompleteDirs): + """ + ZipFile subclass to ensure implicit + dirs exist and are resolved rapidly. + """ + def namelist(self): + with contextlib.suppress(AttributeError): + return self.__names + self.__names = super(FastLookup, self).namelist() + return self.__names + + def _name_set(self): + with contextlib.suppress(AttributeError): + return self.__lookup + self.__lookup = super(FastLookup, self)._name_set() + return self.__lookup + + class Path: """ A pathlib-compatible interface for zip files. @@ -2250,7 +2325,7 @@ class Path: __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" def __init__(self, root, at=""): - self.root = root if isinstance(root, ZipFile) else ZipFile(root) + self.root = FastLookup.make(root) self.at = at @property @@ -2282,12 +2357,12 @@ def is_file(self): return not self.is_dir() def exists(self): - return self.at in self._names() + return self.at in self.root._name_set() def iterdir(self): if not self.is_dir(): raise ValueError("Can't listdir a file") - subs = map(self._next, self._names()) + subs = map(self._next, self.root.namelist()) return filter(self._is_child, subs) def __str__(self): @@ -2298,25 +2373,10 @@ def __repr__(self): def joinpath(self, add): next = posixpath.join(self.at, add) - next_dir = posixpath.join(self.at, add, "") - names = self._names() - return self._next(next_dir if next not in names and next_dir in names else next) + return self._next(self.root.resolve_dir(next)) __truediv__ = joinpath - @staticmethod - def _implied_dirs(names): - return _unique_everseen( - parent + "/" - for name in names - for parent in _parents(name) - if parent + "/" not in names - ) - - @classmethod - def _add_implied_dirs(cls, names): - return names + list(cls._implied_dirs(names)) - @property def parent(self): parent_at = posixpath.dirname(self.at.rstrip('/')) @@ -2324,9 +2384,6 @@ def parent(self): parent_at += '/' return self._next(parent_at) - def _names(self): - return self._add_implied_dirs(self.root.namelist()) - def main(args=None): import argparse @@ -2388,5 +2445,6 @@ def addToZip(zf, path, zippath): zippath = '' addToZip(zf, path, zippath) + if __name__ == "__main__": main() diff --git a/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst b/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst new file mode 100644 index 0000000000000..3a461389af7d1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst @@ -0,0 +1 @@ +Improved performance of zipfile.Path for files with a large number of entries. Also improved performance and fixed minor issue as published with `importlib_metadata 1.5 <https://importlib-metadata.readthedocs.io/en/latest/changelog%20(links).html#v1-5-0>`_.
participants (1)
-
Miss Islington (bot)