[Python-checkins] cpython: Issue #5689: Add support for lzma compression to the tarfile module.

lars.gustaebel python-checkins at python.org
Sat Dec 10 20:40:17 CET 2011


http://hg.python.org/cpython/rev/899a8c7b2310
changeset:   73927:899a8c7b2310
user:        Lars Gustäbel <lars at gustaebel.de>
date:        Sat Dec 10 20:38:14 2011 +0100
summary:
  Issue #5689: Add support for lzma compression to the tarfile module.

files:
  Doc/library/tarfile.rst  |  22 +++++--
  Lib/tarfile.py           |  66 +++++++++++++++++++++--
  Lib/test/test_tarfile.py |  78 ++++++++++++++++++++++++---
  Misc/NEWS                |   2 +
  4 files changed, 146 insertions(+), 22 deletions(-)


diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst
--- a/Doc/library/tarfile.rst
+++ b/Doc/library/tarfile.rst
@@ -13,12 +13,12 @@
 --------------
 
 The :mod:`tarfile` module makes it possible to read and write tar
-archives, including those using gzip or bz2 compression.
+archives, including those using gzip, bz2 and lzma compression.
 (:file:`.zip` files can be read and written using the :mod:`zipfile` module.)
 
 Some facts and figures:
 
-* reads and writes :mod:`gzip` and :mod:`bz2` compressed archives.
+* reads and writes :mod:`gzip`, :mod:`bz2` and :mod:`lzma` compressed archives.
 
 * read/write support for the POSIX.1-1988 (ustar) format.
 
@@ -55,6 +55,8 @@
    +------------------+---------------------------------------------+
    | ``'r:bz2'``      | Open for reading with bzip2 compression.    |
    +------------------+---------------------------------------------+
+   | ``'r:xz'``       | Open for reading with lzma compression.     |
+   +------------------+---------------------------------------------+
    | ``'a' or 'a:'``  | Open for appending with no compression. The |
    |                  | file is created if it does not exist.       |
    +------------------+---------------------------------------------+
@@ -64,11 +66,13 @@
    +------------------+---------------------------------------------+
    | ``'w:bz2'``      | Open for bzip2 compressed writing.          |
    +------------------+---------------------------------------------+
+   | ``'w:xz'``       | Open for lzma compressed writing.           |
+   +------------------+---------------------------------------------+
 
-   Note that ``'a:gz'`` or ``'a:bz2'`` is not possible. If *mode* is not suitable
-   to open a certain (compressed) file for reading, :exc:`ReadError` is raised. Use
-   *mode* ``'r'`` to avoid this.  If a compression method is not supported,
-   :exc:`CompressionError` is raised.
+   Note that ``'a:gz'``, ``'a:bz2'`` or ``'a:xz'`` is not possible. If *mode*
+   is not suitable to open a certain (compressed) file for reading,
+   :exc:`ReadError` is raised. Use *mode* ``'r'`` to avoid this.  If a
+   compression method is not supported, :exc:`CompressionError` is raised.
 
    If *fileobj* is specified, it is used as an alternative to a :term:`file object`
    opened in binary mode for *name*. It is supposed to be at position 0.
@@ -99,6 +103,9 @@
    | ``'r|bz2'`` | Open a bzip2 compressed *stream* for       |
    |             | reading.                                   |
    +-------------+--------------------------------------------+
+   | ``'r|xz'``  | Open an lzma compressed *stream* for       |
+   |             | reading.                                   |
+   +-------------+--------------------------------------------+
    | ``'w|'``    | Open an uncompressed *stream* for writing. |
    +-------------+--------------------------------------------+
    | ``'w|gz'``  | Open a gzip compressed *stream* for        |
@@ -107,6 +114,9 @@
    | ``'w|bz2'`` | Open a bzip2 compressed *stream* for       |
    |             | writing.                                   |
    +-------------+--------------------------------------------+
+   | ``'w|xz'``  | Open an lzma compressed *stream* for       |
+   |             | writing.                                   |
+   +-------------+--------------------------------------------+
 
 
 .. class:: TarFile
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -420,10 +420,11 @@
                 self.crc = zlib.crc32(b"")
                 if mode == "r":
                     self._init_read_gz()
+                    self.exception = zlib.error
                 else:
                     self._init_write_gz()
 
-            if comptype == "bz2":
+            elif comptype == "bz2":
                 try:
                     import bz2
                 except ImportError:
@@ -431,8 +432,25 @@
                 if mode == "r":
                     self.dbuf = b""
                     self.cmp = bz2.BZ2Decompressor()
+                    self.exception = IOError
                 else:
                     self.cmp = bz2.BZ2Compressor()
+
+            elif comptype == "xz":
+                try:
+                    import lzma
+                except ImportError:
+                    raise CompressionError("lzma module is not available")
+                if mode == "r":
+                    self.dbuf = b""
+                    self.cmp = lzma.LZMADecompressor()
+                    self.exception = lzma.LZMAError
+                else:
+                    self.cmp = lzma.LZMACompressor()
+
+            elif comptype != "tar":
+                raise CompressionError("unknown compression type %r" % comptype)
+
         except:
             if not self._extfileobj:
                 self.fileobj.close()
@@ -584,7 +602,7 @@
                 break
             try:
                 buf = self.cmp.decompress(buf)
-            except IOError:
+            except self.exception:
                 raise ReadError("invalid compressed data")
             self.dbuf += buf
             c += len(buf)
@@ -622,11 +640,14 @@
         return self.buf
 
     def getcomptype(self):
-        if self.buf.startswith(b"\037\213\010"):
+        if self.buf.startswith(b"\x1f\x8b\x08"):
             return "gz"
-        if self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
+        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
             return "bz2"
-        return "tar"
+        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
+            return "xz"
+        else:
+            return "tar"
 
     def close(self):
         self.fileobj.close()
@@ -1651,18 +1672,22 @@
            'r:'         open for reading exclusively uncompressed
            'r:gz'       open for reading with gzip compression
            'r:bz2'      open for reading with bzip2 compression
+           'r:xz'       open for reading with lzma compression
            'a' or 'a:'  open for appending, creating the file if necessary
            'w' or 'w:'  open for writing without compression
            'w:gz'       open for writing with gzip compression
            'w:bz2'      open for writing with bzip2 compression
+           'w:xz'       open for writing with lzma compression
 
            'r|*'        open a stream of tar blocks with transparent compression
            'r|'         open an uncompressed stream of tar blocks for reading
            'r|gz'       open a gzip compressed stream of tar blocks
            'r|bz2'      open a bzip2 compressed stream of tar blocks
+           'r|xz'       open an lzma compressed stream of tar blocks
            'w|'         open an uncompressed stream for writing
            'w|gz'       open a gzip compressed stream for writing
            'w|bz2'      open a bzip2 compressed stream for writing
+           'w|xz'       open an lzma compressed stream for writing
         """
 
         if not name and not fileobj:
@@ -1780,11 +1805,40 @@
         t._extfileobj = False
         return t
 
+    @classmethod
+    def xzopen(cls, name, mode="r", fileobj=None, preset=9, **kwargs):
+        """Open lzma compressed tar archive name for reading or writing.
+           Appending is not allowed.
+        """
+        if mode not in ("r", "w"):
+            raise ValueError("mode must be 'r' or 'w'")
+
+        try:
+            import lzma
+        except ImportError:
+            raise CompressionError("lzma module is not available")
+
+        if mode == "r":
+            # LZMAFile complains about a preset argument in read mode.
+            preset = None
+
+        fileobj = lzma.LZMAFile(filename=name if fileobj is None else None,
+                mode=mode, fileobj=fileobj, preset=preset)
+
+        try:
+            t = cls.taropen(name, mode, fileobj, **kwargs)
+        except (lzma.LZMAError, EOFError):
+            fileobj.close()
+            raise ReadError("not an lzma file")
+        t._extfileobj = False
+        return t
+
     # All *open() methods are registered here.
     OPEN_METH = {
         "tar": "taropen",   # uncompressed tar
         "gz":  "gzopen",    # gzip compressed tar
-        "bz2": "bz2open"    # bzip2 compressed tar
+        "bz2": "bz2open",   # bzip2 compressed tar
+        "xz":  "xzopen"     # lzma compressed tar
     }
 
     #--------------------------------------------------------------------------
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -21,6 +21,10 @@
     import bz2
 except ImportError:
     bz2 = None
+try:
+    import lzma
+except ImportError:
+    lzma = None
 
 def md5sum(data):
     return md5(data).hexdigest()
@@ -29,6 +33,7 @@
 tarname = support.findfile("testtar.tar")
 gzipname = os.path.join(TEMPDIR, "testtar.tar.gz")
 bz2name = os.path.join(TEMPDIR, "testtar.tar.bz2")
+xzname = os.path.join(TEMPDIR, "testtar.tar.xz")
 tmpname = os.path.join(TEMPDIR, "tmp.tar")
 
 md5_regtype = "65f477c818ad9e15f7feab0c6d37742f"
@@ -201,13 +206,15 @@
             _open = gzip.GzipFile
         elif self.mode.endswith(":bz2"):
             _open = bz2.BZ2File
+        elif self.mode.endswith(":xz"):
+            _open = lzma.LZMAFile
         else:
-            _open = open
+            _open = io.FileIO
 
         for char in (b'\0', b'a'):
             # Test if EOFHeaderError ('\0') and InvalidHeaderError ('a')
             # are ignored correctly.
-            with _open(tmpname, "wb") as fobj:
+            with _open(tmpname, "w") as fobj:
                 fobj.write(char * 1024)
                 fobj.write(tarfile.TarInfo("foo").tobuf())
 
@@ -222,9 +229,10 @@
 class MiscReadTest(CommonReadTest):
 
     def test_no_name_argument(self):
-        if self.mode.endswith("bz2"):
-            # BZ2File has no name attribute.
-            return
+        if self.mode.endswith(("bz2", "xz")):
+            # BZ2File and LZMAFile have no name attribute.
+            self.skipTest("no name attribute")
+
         with open(self.tarname, "rb") as fobj:
             tar = tarfile.open(fileobj=fobj, mode=self.mode)
             self.assertEqual(tar.name, os.path.abspath(fobj.name))
@@ -265,10 +273,12 @@
             _open = gzip.GzipFile
         elif self.mode.endswith(":bz2"):
             _open = bz2.BZ2File
+        elif self.mode.endswith(":xz"):
+            _open = lzma.LZMAFile
         else:
-            _open = open
-        fobj = _open(self.tarname, "rb")
-        try:
+            _open = io.FileIO
+
+        with _open(self.tarname) as fobj:
             fobj.seek(offset)
 
             # Test if the tarfile starts with the second member.
@@ -281,8 +291,6 @@
             self.assertEqual(tar.extractfile(t).read(), data,
                     "seek back did not work")
             tar.close()
-        finally:
-            fobj.close()
 
     def test_fail_comp(self):
         # For Gzip and Bz2 Tests: fail with a ReadError on an uncompressed file.
@@ -526,6 +534,18 @@
             testfunc(bz2name, "r|*")
             testfunc(bz2name, "r|bz2")
 
+        if lzma:
+            self.assertRaises(tarfile.ReadError, tarfile.open, tarname, mode="r:xz")
+            self.assertRaises(tarfile.ReadError, tarfile.open, tarname, mode="r|xz")
+            self.assertRaises(tarfile.ReadError, tarfile.open, xzname, mode="r:")
+            self.assertRaises(tarfile.ReadError, tarfile.open, xzname, mode="r|")
+
+            testfunc(xzname, "r")
+            testfunc(xzname, "r:*")
+            testfunc(xzname, "r:xz")
+            testfunc(xzname, "r|*")
+            testfunc(xzname, "r|xz")
+
     def test_detect_file(self):
         self._test_modes(self._testfunc_file)
 
@@ -1096,6 +1116,9 @@
             data = dec.decompress(data)
             self.assertTrue(len(dec.unused_data) == 0,
                     "found trailing data")
+        elif self.mode.endswith("xz"):
+            with lzma.LZMAFile(tmpname) as fobj:
+                data = fobj.read()
         else:
             with open(tmpname, "rb") as fobj:
                 data = fobj.read()
@@ -1510,6 +1533,12 @@
         self._create_testtar("w:bz2")
         self.assertRaises(tarfile.ReadError, tarfile.open, tmpname, "a")
 
+    def test_append_lzma(self):
+        if lzma is None:
+            self.skipTest("lzma module not available")
+        self._create_testtar("w:xz")
+        self.assertRaises(tarfile.ReadError, tarfile.open, tmpname, "a")
+
     # Append mode is supposed to fail if the tarfile to append to
     # does not end with a zero block.
     def _test_error(self, data):
@@ -1788,6 +1817,21 @@
         self._test_partial_input("r:bz2")
 
 
+class LzmaMiscReadTest(MiscReadTest):
+    tarname = xzname
+    mode = "r:xz"
+class LzmaUstarReadTest(UstarReadTest):
+    tarname = xzname
+    mode = "r:xz"
+class LzmaStreamReadTest(StreamReadTest):
+    tarname = xzname
+    mode = "r|xz"
+class LzmaWriteTest(WriteTest):
+    mode = "w:xz"
+class LzmaStreamWriteTest(StreamWriteTest):
+    mode = "w|xz"
+
+
 def test_main():
     support.unlink(TEMPDIR)
     os.makedirs(TEMPDIR)
@@ -1850,6 +1894,20 @@
             Bz2PartialReadTest,
         ]
 
+    if lzma:
+        # Create testtar.tar.xz and add lzma-specific tests.
+        support.unlink(xzname)
+        with lzma.LZMAFile(xzname, "w") as tar:
+            tar.write(data)
+
+        tests += [
+            LzmaMiscReadTest,
+            LzmaUstarReadTest,
+            LzmaStreamReadTest,
+            LzmaWriteTest,
+            LzmaStreamWriteTest,
+        ]
+
     try:
         support.run_unittest(*tests)
     finally:
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -406,6 +406,8 @@
 Library
 -------
 
+- Issue #5689: Add support for lzma compression to the tarfile module.
+
 - Issue #13248: Turn 3.2's PendingDeprecationWarning into 3.3's
   DeprecationWarning.  It covers 'cgi.escape', 'importlib.abc.PyLoader',
   'importlib.abc.PyPycLoader', 'nntplib.NNTP.xgtitle', 'nntplib.NNTP.xpath',

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list