[Python-checkins] bpo-33695 shutil.copytree() + os.scandir() cache (#7874)

Giampaolo Rodola webhook-mailer at python.org
Mon Nov 12 09:18:25 EST 2018


https://github.com/python/cpython/commit/19c46a4c96553b2a8390bf8a0e138f2b23e28ed6
commit: 19c46a4c96553b2a8390bf8a0e138f2b23e28ed6
branch: master
author: Giampaolo Rodola <g.rodola at gmail.com>
committer: GitHub <noreply at github.com>
date: 2018-11-12T06:18:15-08:00
summary:

bpo-33695 shutil.copytree() + os.scandir() cache (#7874)

files:
A Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst
M Doc/whatsnew/3.8.rst
M Lib/shutil.py

diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
index 91e0d5bb7b33..e5e6d4a59944 100644
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -277,6 +277,14 @@ Optimizations
   See :ref:`shutil-platform-dependent-efficient-copy-operations` section.
   (Contributed by Giampaolo Rodola' in :issue:`25427`.)
 
+* :func:`shutil.copytree` uses the :func:`os.scandir` function, and all copy
+  functions that depend on it use cached :func:`os.stat` values. The speedup
+  for copying a directory with 8000 files is around +9% on Linux, +20% on
+  Windows and +30% on a Windows SMB share. Also, the number of :func:`os.stat`
+  syscalls is reduced by 38%, making :func:`shutil.copytree` especially faster
+  on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.)
+
+
 * The default protocol in the :mod:`pickle` module is now Protocol 4,
   first introduced in Python 3.4.  It offers better performance and smaller
   size compared to Protocol 3 available since Python 3.0.
diff --git a/Lib/shutil.py b/Lib/shutil.py
index b7a7df3a51fa..74348ba62ef7 100644
--- a/Lib/shutil.py
+++ b/Lib/shutil.py
@@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
 
 def _samefile(src, dst):
     # Macintosh, Unix.
+    if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'):
+        try:
+            return os.path.samestat(src.stat(), os.stat(dst))
+        except OSError:
+            return False
+
     if hasattr(os.path, 'samefile'):
         try:
             return os.path.samefile(src, dst)
@@ -210,6 +216,12 @@ def _samefile(src, dst):
     return (os.path.normcase(os.path.abspath(src)) ==
             os.path.normcase(os.path.abspath(dst)))
 
+def _stat(fn):
+    return fn.stat() if isinstance(fn, os.DirEntry) else os.stat(fn)
+
+def _islink(fn):
+    return fn.is_symlink() if isinstance(fn, os.DirEntry) else os.path.islink(fn)
+
 def copyfile(src, dst, *, follow_symlinks=True):
     """Copy data from src to dst in the most efficient way possible.
 
@@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True):
     file_size = 0
     for i, fn in enumerate([src, dst]):
         try:
-            st = os.stat(fn)
+            st = _stat(fn)
         except OSError:
             # File most likely does not exist
             pass
         else:
             # XXX What about other special files? (sockets, devices...)
             if stat.S_ISFIFO(st.st_mode):
+                fn = fn.path if isinstance(fn, os.DirEntry) else fn
                 raise SpecialFileError("`%s` is a named pipe" % fn)
             if _WINDOWS and i == 0:
                 file_size = st.st_size
 
-    if not follow_symlinks and os.path.islink(src):
+    if not follow_symlinks and _islink(src):
         os.symlink(os.readlink(src), dst)
     else:
         with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
@@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True):
     (e.g. Linux) this method does nothing.
 
     """
-    if not follow_symlinks and os.path.islink(src) and os.path.islink(dst):
+    if not follow_symlinks and _islink(src) and os.path.islink(dst):
         if hasattr(os, 'lchmod'):
             stat_func, chmod_func = os.lstat, os.lchmod
         else:
             return
     elif hasattr(os, 'chmod'):
-        stat_func, chmod_func = os.stat, os.chmod
+        stat_func, chmod_func = _stat, os.chmod
     else:
         return
 
@@ -325,7 +338,7 @@ def _nop(*args, ns=None, follow_symlinks=None):
         pass
 
     # follow symlinks (aka don't not follow symlinks)
-    follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst))
+    follow = follow_symlinks or not (_islink(src) and os.path.islink(dst))
     if follow:
         # use the real function if it exists
         def lookup(name):
@@ -339,7 +352,10 @@ def lookup(name):
                 return fn
             return _nop
 
-    st = lookup("stat")(src, follow_symlinks=follow)
+    if isinstance(src, os.DirEntry):
+        st = src.stat(follow_symlinks=follow)
+    else:
+        st = lookup("stat")(src, follow_symlinks=follow)
     mode = stat.S_IMODE(st.st_mode)
     lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns),
         follow_symlinks=follow)
@@ -415,79 +431,47 @@ def _ignore_patterns(path, names):
         return set(ignored_names)
     return _ignore_patterns
 
-def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
-             ignore_dangling_symlinks=False):
-    """Recursively copy a directory tree.
-
-    The destination directory must not already exist.
-    If exception(s) occur, an Error is raised with a list of reasons.
-
-    If the optional symlinks flag is true, symbolic links in the
-    source tree result in symbolic links in the destination tree; if
-    it is false, the contents of the files pointed to by symbolic
-    links are copied. If the file pointed by the symlink doesn't
-    exist, an exception will be added in the list of errors raised in
-    an Error exception at the end of the copy process.
-
-    You can set the optional ignore_dangling_symlinks flag to true if you
-    want to silence this exception. Notice that this has no effect on
-    platforms that don't support os.symlink.
-
-    The optional ignore argument is a callable. If given, it
-    is called with the `src` parameter, which is the directory
-    being visited by copytree(), and `names` which is the list of
-    `src` contents, as returned by os.listdir():
-
-        callable(src, names) -> ignored_names
-
-    Since copytree() is called recursively, the callable will be
-    called once for each directory that is copied. It returns a
-    list of names relative to the `src` directory that should
-    not be copied.
-
-    The optional copy_function argument is a callable that will be used
-    to copy each file. It will be called with the source path and the
-    destination path as arguments. By default, copy2() is used, but any
-    function that supports the same signature (like copy()) can be used.
-
-    """
-    names = os.listdir(src)
+def _copytree(entries, src, dst, symlinks, ignore, copy_function,
+              ignore_dangling_symlinks):
     if ignore is not None:
-        ignored_names = ignore(src, names)
+        ignored_names = ignore(src, set(os.listdir(src)))
     else:
         ignored_names = set()
 
     os.makedirs(dst)
     errors = []
-    for name in names:
-        if name in ignored_names:
+    use_srcentry = copy_function is copy2 or copy_function is copy
+
+    for srcentry in entries:
+        if srcentry.name in ignored_names:
             continue
-        srcname = os.path.join(src, name)
-        dstname = os.path.join(dst, name)
+        srcname = os.path.join(src, srcentry.name)
+        dstname = os.path.join(dst, srcentry.name)
+        srcobj = srcentry if use_srcentry else srcname
         try:
-            if os.path.islink(srcname):
+            if srcentry.is_symlink():
                 linkto = os.readlink(srcname)
                 if symlinks:
                     # We can't just leave it to `copy_function` because legacy
                     # code with a custom `copy_function` may rely on copytree
                     # doing the right thing.
                     os.symlink(linkto, dstname)
-                    copystat(srcname, dstname, follow_symlinks=not symlinks)
+                    copystat(srcobj, dstname, follow_symlinks=not symlinks)
                 else:
                     # ignore dangling symlink if the flag is on
                     if not os.path.exists(linkto) and ignore_dangling_symlinks:
                         continue
                     # otherwise let the copy occurs. copy2 will raise an error
-                    if os.path.isdir(srcname):
-                        copytree(srcname, dstname, symlinks, ignore,
+                    if srcentry.is_dir():
+                        copytree(srcobj, dstname, symlinks, ignore,
                                  copy_function)
                     else:
-                        copy_function(srcname, dstname)
-            elif os.path.isdir(srcname):
-                copytree(srcname, dstname, symlinks, ignore, copy_function)
+                        copy_function(srcobj, dstname)
+            elif srcentry.is_dir():
+                copytree(srcobj, dstname, symlinks, ignore, copy_function)
             else:
                 # Will raise a SpecialFileError for unsupported file types
-                copy_function(srcname, dstname)
+                copy_function(srcentry, dstname)
         # catch the Error from the recursive copytree so that we can
         # continue with other files
         except Error as err:
@@ -504,6 +488,47 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
         raise Error(errors)
     return dst
 
+def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
+             ignore_dangling_symlinks=False):
+    """Recursively copy a directory tree.
+
+    The destination directory must not already exist.
+    If exception(s) occur, an Error is raised with a list of reasons.
+
+    If the optional symlinks flag is true, symbolic links in the
+    source tree result in symbolic links in the destination tree; if
+    it is false, the contents of the files pointed to by symbolic
+    links are copied. If the file pointed by the symlink doesn't
+    exist, an exception will be added in the list of errors raised in
+    an Error exception at the end of the copy process.
+
+    You can set the optional ignore_dangling_symlinks flag to true if you
+    want to silence this exception. Notice that this has no effect on
+    platforms that don't support os.symlink.
+
+    The optional ignore argument is a callable. If given, it
+    is called with the `src` parameter, which is the directory
+    being visited by copytree(), and `names` which is the list of
+    `src` contents, as returned by os.listdir():
+
+        callable(src, names) -> ignored_names
+
+    Since copytree() is called recursively, the callable will be
+    called once for each directory that is copied. It returns a
+    list of names relative to the `src` directory that should
+    not be copied.
+
+    The optional copy_function argument is a callable that will be used
+    to copy each file. It will be called with the source path and the
+    destination path as arguments. By default, copy2() is used, but any
+    function that supports the same signature (like copy()) can be used.
+
+    """
+    with os.scandir(src) as entries:
+        return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks,
+                         ignore=ignore, copy_function=copy_function,
+                         ignore_dangling_symlinks=ignore_dangling_symlinks)
+
 # version vulnerable to race conditions
 def _rmtree_unsafe(path, onerror):
     try:
diff --git a/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst b/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst
new file mode 100644
index 000000000000..21950453b0ad
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst
@@ -0,0 +1,7 @@
+:func:`shutil.copytree` uses the :func:`os.scandir` function, and all copy
+functions that depend on it use cached :func:`os.stat` values. The speedup
+for copying a directory with 8000 files is around +9% on Linux, +20% on
+Windows and +30% on a Windows SMB share. Also, the number of :func:`os.stat`
+syscalls is reduced by 38%, making :func:`shutil.copytree` especially faster
+on network filesystems.
+(Contributed by Giampaolo Rodola' in :issue:`33695`.)



More information about the Python-checkins mailing list