[Python-checkins] bpo-34010: Fix tarfile read performance regression (GH-8020)

Wed Jul 4 04:13:22 EDT 2018

https://github.com/python/cpython/commit/12a08c47601cadea8e7d3808502cdbcca87b2ce2
commit: 12a08c47601cadea8e7d3808502cdbcca87b2ce2
branch: master
author: hajoscher <hajoscher at gmail.com>
committer: INADA Naoki <methane at users.noreply.github.com>
date: 2018-07-04T17:13:18+09:00
summary:

bpo-34010: Fix tarfile read performance regression (GH-8020)

During buffered reads, collect the chunks in a list and join them once, instead of repeatedly extending a bytes object.
This is how it was done before the behavior was changed in commit b506dc32c1a.

files:
A Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
M Lib/tarfile.py

diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 7b4732d47197..59f044cc5a00 100755
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -525,7 +525,7 @@ def read(self, size=None):
                 if not buf:
                     break
                 t.append(buf)
-            buf = "".join(t)
+            buf = b"".join(t)
         else:
             buf = self._read(size)
         self.pos += len(buf)
@@ -538,6 +538,7 @@ def _read(self, size):
             return self.__read(size)
 
         c = len(self.dbuf)
+        t = [self.dbuf]
         while c < size:
             buf = self.__read(self.bufsize)
             if not buf:
@@ -546,26 +547,27 @@ def _read(self, size):
                 buf = self.cmp.decompress(buf)
             except self.exception:
                 raise ReadError("invalid compressed data")
-            self.dbuf += buf
+            t.append(buf)
             c += len(buf)
-        buf = self.dbuf[:size]
-        self.dbuf = self.dbuf[size:]
-        return buf
+        t = b"".join(t)
+        self.dbuf = t[size:]
+        return t[:size]
 
     def __read(self, size):
         """Return size bytes from stream. If internal buffer is empty,
            read another block from the stream.
         """
         c = len(self.buf)
+        t = [self.buf]
         while c < size:
             buf = self.fileobj.read(self.bufsize)
             if not buf:
                 break
-            self.buf += buf
+            t.append(buf)
             c += len(buf)
-        buf = self.buf[:size]
-        self.buf = self.buf[size:]
-        return buf
+        t = b"".join(t)
+        self.buf = t[size:]
+        return t[:size]
 # class _Stream
 
 class _StreamProxy(object):
diff --git a/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
new file mode 100644
index 000000000000..4cb7892ee81a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
@@ -0,0 +1,2 @@
+Fixed a performance regression for reading streams with tarfile. The
+buffered read should use a list, instead of appending to a bytes object.