[Python-checkins] cpython: Issue #13968: The glob module now supports recursive search in

serhiy.storchaka python-checkins at python.org
Thu Sep 11 11:23:11 CEST 2014


http://hg.python.org/cpython/rev/ff4b9d654691
changeset:   92400:ff4b9d654691
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Thu Sep 11 12:17:37 2014 +0300
summary:
  Issue #13968: The glob module now supports recursive search in
 subdirectories using the "**" pattern.

files:
  Doc/library/glob.rst  |   24 ++++-
  Doc/whatsnew/3.5.rst  |    7 +
  Lib/glob.py           |   56 +++++++++++-
  Lib/test/test_glob.py |  128 +++++++++++++++++++++++++++--
  Misc/NEWS             |    3 +
  5 files changed, 199 insertions(+), 19 deletions(-)


diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst
--- a/Doc/library/glob.rst
+++ b/Doc/library/glob.rst
@@ -29,7 +29,7 @@
    The :mod:`pathlib` module offers high-level path objects.
 
 
-.. function:: glob(pathname)
+.. function:: glob(pathname, *, recursive=False)
 
    Return a possibly-empty list of path names that match *pathname*, which must be
    a string containing a path specification. *pathname* can be either absolute
@@ -37,8 +37,19 @@
    :file:`../../Tools/\*/\*.gif`), and can contain shell-style wildcards. Broken
    symlinks are included in the results (as in the shell).
 
+   If *recursive* is true, the pattern "``**``" will match any files and zero or
+   more directories and subdirectories.  If the pattern is followed by an
+   ``os.sep``, only directories and subdirectories match.
 
-.. function:: iglob(pathname)
+   .. note::
+      Using the "``**``" pattern in large directory trees may consume
+      an inordinate amount of time.
+
+   .. versionchanged:: 3.5
+      Support for recursive globs using "``**``".
+
+
+.. function:: iglob(pathname, *, recursive=False)
 
    Return an :term:`iterator` which yields the same values as :func:`glob`
    without actually storing them all simultaneously.
@@ -55,8 +66,9 @@
    .. versionadded:: 3.4
 
 
-For example, consider a directory containing only the following files:
-:file:`1.gif`, :file:`2.txt`, and :file:`card.gif`.  :func:`glob` will produce
+For example, consider a directory containing the following files:
+:file:`1.gif`, :file:`2.txt`, :file:`card.gif` and a subdirectory :file:`sub`
+which contains only the file :file:`3.txt`.  :func:`glob` will produce
 the following results.  Notice how any leading components of the path are
 preserved. ::
 
@@ -67,6 +79,10 @@
    ['1.gif', 'card.gif']
    >>> glob.glob('?.gif')
    ['1.gif']
+   >>> glob.glob('**/*.txt', recursive=True)
+   ['2.txt', 'sub/3.txt']
+   >>> glob.glob('./**/', recursive=True)
+   ['./', './sub/']
 
 If the directory contains files starting with ``.`` they won't be matched by
 default. For example, consider a directory containing :file:`card.gif` and
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -141,6 +141,13 @@
   *module* contains no docstrings instead of raising :exc:`ValueError`
   (contributed by Glenn Jones in :issue:`15916`).
 
+glob
+----
+
+* :func:`~glob.iglob` and :func:`~glob.glob` now support recursive search in
+  subdirectories using the "``**``" pattern.
+  (Contributed by Serhiy Storchaka in :issue:`13968`.)
+
 imaplib
 -------
 
diff --git a/Lib/glob.py b/Lib/glob.py
--- a/Lib/glob.py
+++ b/Lib/glob.py
@@ -6,7 +6,7 @@
 
 __all__ = ["glob", "iglob"]
 
-def glob(pathname):
+def glob(pathname, *, recursive=False):
     """Return a list of paths matching a pathname pattern.
 
     The pattern may contain simple shell-style wildcards a la
@@ -14,10 +14,12 @@
     dot are special cases that are not matched by '*' and '?'
     patterns.
 
+    If recursive is true, the pattern '**' will match any files and
+    zero or more directories and subdirectories.
     """
-    return list(iglob(pathname))
+    return list(iglob(pathname, recursive=recursive))
 
-def iglob(pathname):
+def iglob(pathname, *, recursive=False):
     """Return an iterator which yields the paths matching a pathname pattern.
 
     The pattern may contain simple shell-style wildcards a la
@@ -25,6 +27,8 @@
     dot are special cases that are not matched by '*' and '?'
     patterns.
 
+    If recursive is true, the pattern '**' will match any files and
+    zero or more directories and subdirectories.
     """
     dirname, basename = os.path.split(pathname)
     if not has_magic(pathname):
@@ -37,17 +41,23 @@
                 yield pathname
         return
     if not dirname:
-        yield from glob1(None, basename)
+        if recursive and _isrecursive(basename):
+            yield from glob2(dirname, basename)
+        else:
+            yield from glob1(dirname, basename)
         return
     # `os.path.split()` returns the argument itself as a dirname if it is a
     # drive or UNC path.  Prevent an infinite recursion if a drive or UNC path
     # contains magic characters (i.e. r'\\?\C:').
     if dirname != pathname and has_magic(dirname):
-        dirs = iglob(dirname)
+        dirs = iglob(dirname, recursive=recursive)
     else:
         dirs = [dirname]
     if has_magic(basename):
-        glob_in_dir = glob1
+        if recursive and _isrecursive(basename):
+            glob_in_dir = glob2
+        else:
+            glob_in_dir = glob1
     else:
         glob_in_dir = glob0
     for dirname in dirs:
@@ -83,6 +93,34 @@
             return [basename]
     return []
 
+# This helper function recursively yields relative pathnames inside a literal
+# directory.
+
+def glob2(dirname, pattern):
+    assert _isrecursive(pattern)
+    if dirname:
+        yield pattern[:0]
+    yield from _rlistdir(dirname)
+
+# Recursively yields relative pathnames inside a literal directory.
+
+def _rlistdir(dirname):
+    if not dirname:
+        if isinstance(dirname, bytes):
+            dirname = bytes(os.curdir, 'ASCII')
+        else:
+            dirname = os.curdir
+    try:
+        names = os.listdir(dirname)
+    except os.error:
+        return
+    for x in names:
+        if not _ishidden(x):
+            yield x
+            path = os.path.join(dirname, x) if dirname else x
+            for y in _rlistdir(path):
+                yield os.path.join(x, y)
+
 
 magic_check = re.compile('([*?[])')
 magic_check_bytes = re.compile(b'([*?[])')
@@ -97,6 +135,12 @@
 def _ishidden(path):
     return path[0] in ('.', b'.'[0])
 
+def _isrecursive(pattern):
+    if isinstance(pattern, bytes):
+        return pattern == b'**'
+    else:
+        return pattern == '**'
+
 def escape(pathname):
     """Escape all special characters.
     """
diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py
--- a/Lib/test/test_glob.py
+++ b/Lib/test/test_glob.py
@@ -4,7 +4,7 @@
 import sys
 import unittest
 
-from test.support import (run_unittest, TESTFN, skip_unless_symlink,
+from test.support import (TESTFN, skip_unless_symlink,
                           can_symlink, create_empty_file)
 
 
@@ -13,6 +13,9 @@
     def norm(self, *parts):
         return os.path.normpath(os.path.join(self.tempdir, *parts))
 
+    def joins(self, *tuples):
+        return [os.path.join(self.tempdir, *parts) for parts in tuples]
+
     def mktemp(self, *parts):
         filename = self.norm(*parts)
         base, file = os.path.split(filename)
@@ -38,17 +41,17 @@
     def tearDown(self):
         shutil.rmtree(self.tempdir)
 
-    def glob(self, *parts):
+    def glob(self, *parts, **kwargs):
         if len(parts) == 1:
             pattern = parts[0]
         else:
             pattern = os.path.join(*parts)
         p = os.path.join(self.tempdir, pattern)
-        res = glob.glob(p)
-        self.assertEqual(list(glob.iglob(p)), res)
+        res = glob.glob(p, **kwargs)
+        self.assertEqual(list(glob.iglob(p, **kwargs)), res)
         bres = [os.fsencode(x) for x in res]
-        self.assertEqual(glob.glob(os.fsencode(p)), bres)
-        self.assertEqual(list(glob.iglob(os.fsencode(p))), bres)
+        self.assertEqual(glob.glob(os.fsencode(p), **kwargs), bres)
+        self.assertEqual(list(glob.iglob(os.fsencode(p), **kwargs)), bres)
         return res
 
     def assertSequencesEqual_noorder(self, l1, l2):
@@ -192,9 +195,116 @@
         check('//?/c:/?', '//?/c:/[?]')
         check('//*/*/*', '//*/*/[*]')
 
-def test_main():
-    run_unittest(GlobTests)
+    def rglob(self, *parts, **kwargs):
+        return self.glob(*parts, recursive=True, **kwargs)
+
+    def test_recursive_glob(self):
+        eq = self.assertSequencesEqual_noorder
+        full = [('ZZZ',),
+                ('a',), ('a', 'D'),
+                ('a', 'bcd'),
+                ('a', 'bcd', 'EF'),
+                ('a', 'bcd', 'efg'),
+                ('a', 'bcd', 'efg', 'ha'),
+                ('aaa',), ('aaa', 'zzzF'),
+                ('aab',), ('aab', 'F'),
+               ]
+        if can_symlink():
+            full += [('sym1',), ('sym2',),
+                     ('sym3',),
+                     ('sym3', 'EF'),
+                     ('sym3', 'efg'),
+                     ('sym3', 'efg', 'ha'),
+                    ]
+        eq(self.rglob('**'), self.joins(('',), *full))
+        eq(self.rglob('.', '**'), self.joins(('.',''),
+            *(('.',) + i for i in full)))
+        dirs = [('a', ''), ('a', 'bcd', ''), ('a', 'bcd', 'efg', ''),
+                ('aaa', ''), ('aab', '')]
+        if can_symlink():
+            dirs += [('sym3', ''), ('sym3', 'efg', '')]
+        eq(self.rglob('**', ''), self.joins(('',), *dirs))
+
+        eq(self.rglob('a', '**'), self.joins(
+            ('a', ''), ('a', 'D'), ('a', 'bcd'), ('a', 'bcd', 'EF'),
+            ('a', 'bcd', 'efg'), ('a', 'bcd', 'efg', 'ha')))
+        eq(self.rglob('a**'), self.joins(('a',), ('aaa',), ('aab',)))
+        expect = [('a', 'bcd', 'EF')]
+        if can_symlink():
+            expect += [('sym3', 'EF')]
+        eq(self.rglob('**', 'EF'), self.joins(*expect))
+        expect = [('a', 'bcd', 'EF'), ('aaa', 'zzzF'), ('aab', 'F')]
+        if can_symlink():
+            expect += [('sym3', 'EF')]
+        eq(self.rglob('**', '*F'), self.joins(*expect))
+        eq(self.rglob('**', '*F', ''), [])
+        eq(self.rglob('**', 'bcd', '*'), self.joins(
+            ('a', 'bcd', 'EF'), ('a', 'bcd', 'efg')))
+        eq(self.rglob('a', '**', 'bcd'), self.joins(('a', 'bcd')))
+
+        predir = os.path.abspath(os.curdir)
+        try:
+            os.chdir(self.tempdir)
+            join = os.path.join
+            eq(glob.glob('**', recursive=True), [join(*i) for i in full])
+            eq(glob.glob(join('**', ''), recursive=True),
+                [join(*i) for i in dirs])
+            eq(glob.glob(join('**','zz*F'), recursive=True),
+                [join('aaa', 'zzzF')])
+            eq(glob.glob('**zz*F', recursive=True), [])
+            expect = [join('a', 'bcd', 'EF')]
+            if can_symlink():
+                expect += [join('sym3', 'EF')]
+            eq(glob.glob(join('**', 'EF'), recursive=True), expect)
+        finally:
+            os.chdir(predir)
+
+
+@skip_unless_symlink
+class SymlinkLoopGlobTests(unittest.TestCase):
+
+    def test_selflink(self):
+        tempdir = TESTFN + "_dir"
+        os.makedirs(tempdir)
+        create_empty_file(os.path.join(tempdir, 'file'))
+        os.symlink(os.curdir, os.path.join(tempdir, 'link'))
+        self.addCleanup(shutil.rmtree, tempdir)
+
+        results = glob.glob('**', recursive=True)
+        self.assertEqual(len(results), len(set(results)))
+        results = set(results)
+        depth = 0
+        while results:
+            path = os.path.join(*([tempdir] + ['link'] * depth))
+            self.assertIn(path, results)
+            results.remove(path)
+            if not results:
+                break
+            path = os.path.join(path, 'file')
+            self.assertIn(path, results)
+            results.remove(path)
+            depth += 1
+
+        results = glob.glob(os.path.join('**', 'file'), recursive=True)
+        self.assertEqual(len(results), len(set(results)))
+        results = set(results)
+        depth = 0
+        while results:
+            path = os.path.join(*([tempdir] + ['link'] * depth + ['file']))
+            self.assertIn(path, results)
+            results.remove(path)
+            depth += 1
+
+        results = glob.glob(os.path.join('**', ''), recursive=True)
+        self.assertEqual(len(results), len(set(results)))
+        results = set(results)
+        depth = 0
+        while results:
+            path = os.path.join(*([tempdir] + ['link'] * depth + ['']))
+            self.assertIn(path, results)
+            results.remove(path)
+            depth += 1
 
 
 if __name__ == "__main__":
-    test_main()
+    unittest.main()
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -132,6 +132,9 @@
 Library
 -------
 
+- Issue #13968: The glob module now supports recursive search in
+  subdirectories using the "**" pattern.
+
 - Issue #21951: Fixed a crash in Tkinter on AIX when called Tcl command with
   empty string or tuple argument.
 

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list