[Python-checkins] cpython (3.5): Issue #24838: tarfile's ustar and gnu formats now correctly calculate name and

Tue Apr 19 02:54:46 EDT 2016

https://hg.python.org/cpython/rev/d08d6b776694
changeset:   101054:d08d6b776694
branch:      3.5
parent:      101052:11d8f5d1968d
user:        Lars Gustäbel <lars at gustaebel.de>
date:        Tue Apr 19 08:43:17 2016 +0200
summary:
  Issue #24838: tarfile's ustar and gnu formats now correctly calculate name and
link field limits for multibyte character encodings like utf-8.

files:
  Lib/tarfile.py           |  29 ++++----
  Lib/test/test_tarfile.py |  87 ++++++++++++++++++++++++++-
  Misc/NEWS                |   3 +
  3 files changed, 100 insertions(+), 19 deletions(-)

diff --git a/Lib/tarfile.py b/Lib/tarfile.py
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -812,11 +812,11 @@
         """
         info["magic"] = POSIX_MAGIC
 
-        if len(info["linkname"]) > LENGTH_LINK:
+        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
             raise ValueError("linkname is too long")
 
-        if len(info["name"]) > LENGTH_NAME:
-            info["prefix"], info["name"] = self._posix_split_name(info["name"])
+        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
+            info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)
 
         return self._create_header(info, USTAR_FORMAT, encoding, errors)
 
@@ -826,10 +826,10 @@
         info["magic"] = GNU_MAGIC
 
         buf = b""
-        if len(info["linkname"]) > LENGTH_LINK:
+        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
 
-        if len(info["name"]) > LENGTH_NAME:
+        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
             buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
 
         return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
@@ -889,19 +889,20 @@
         """
         return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
 
-    def _posix_split_name(self, name):
+    def _posix_split_name(self, name, encoding, errors):
         """Split a name longer than 100 chars into a prefix
            and a name part.
         """
-        prefix = name[:LENGTH_PREFIX + 1]
-        while prefix and prefix[-1] != "/":
-            prefix = prefix[:-1]
+        components = name.split("/")
+        for i in range(1, len(components)):
+            prefix = "/".join(components[:i])
+            name = "/".join(components[i:])
+            if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
+                    len(name.encode(encoding, errors)) <= LENGTH_NAME:
+                break
+        else:
+            raise ValueError("name is too long")
 
-        name = name[len(prefix):]
-        prefix = prefix[:-1]
-
-        if not prefix or len(name) > LENGTH_NAME:
-            raise ValueError("name is too long")
         return prefix, name
 
     @staticmethod
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -1667,9 +1667,7 @@
             tar.close()
 
 
-class UstarUnicodeTest(unittest.TestCase):
-
-    format = tarfile.USTAR_FORMAT
+class UnicodeTest:
 
     def test_iso8859_1_filename(self):
         self._test_unicode_filename("iso8859-1")
@@ -1750,7 +1748,86 @@
             tar.close()
 
 
-class GNUUnicodeTest(UstarUnicodeTest):
+class UstarUnicodeTest(UnicodeTest, unittest.TestCase):
+
+    format = tarfile.USTAR_FORMAT
+
+    # Test whether the utf-8 encoded version of a filename exceeds the 100
+    # bytes name field limit (every occurrence of '\xff' will be expanded to 2
+    # bytes).
+    def test_unicode_name1(self):
+        self._test_ustar_name("0123456789" * 10)
+        self._test_ustar_name("0123456789" * 10 + "0", ValueError)
+        self._test_ustar_name("0123456789" * 9 + "01234567\xff")
+        self._test_ustar_name("0123456789" * 9 + "012345678\xff", ValueError)
+
+    def test_unicode_name2(self):
+        self._test_ustar_name("0123456789" * 9 + "012345\xff\xff")
+        self._test_ustar_name("0123456789" * 9 + "0123456\xff\xff", ValueError)
+
+    # Test whether the utf-8 encoded version of a filename exceeds the 155
+    # bytes prefix + '/' + 100 bytes name limit.
+    def test_unicode_longname1(self):
+        self._test_ustar_name("0123456789" * 15 + "01234/" + "0123456789" * 10)
+        self._test_ustar_name("0123456789" * 15 + "0123/4" + "0123456789" * 10, ValueError)
+        self._test_ustar_name("0123456789" * 15 + "012\xff/" + "0123456789" * 10)
+        self._test_ustar_name("0123456789" * 15 + "0123\xff/" + "0123456789" * 10, ValueError)
+
+    def test_unicode_longname2(self):
+        self._test_ustar_name("0123456789" * 15 + "01\xff/2" + "0123456789" * 10, ValueError)
+        self._test_ustar_name("0123456789" * 15 + "01\xff\xff/" + "0123456789" * 10, ValueError)
+
+    def test_unicode_longname3(self):
+        self._test_ustar_name("0123456789" * 15 + "01\xff\xff/2" + "0123456789" * 10, ValueError)
+        self._test_ustar_name("0123456789" * 15 + "01234/" + "0123456789" * 9 + "01234567\xff")
+        self._test_ustar_name("0123456789" * 15 + "01234/" + "0123456789" * 9 + "012345678\xff", ValueError)
+
+    def test_unicode_longname4(self):
+        self._test_ustar_name("0123456789" * 15 + "01234/" + "0123456789" * 9 + "012345\xff\xff")
+        self._test_ustar_name("0123456789" * 15 + "01234/" + "0123456789" * 9 + "0123456\xff\xff", ValueError)
+
+    def _test_ustar_name(self, name, exc=None):
+        with tarfile.open(tmpname, "w", format=self.format, encoding="utf-8") as tar:
+            t = tarfile.TarInfo(name)
+            if exc is None:
+                tar.addfile(t)
+            else:
+                self.assertRaises(exc, tar.addfile, t)
+
+        if exc is None:
+            with tarfile.open(tmpname, "r") as tar:
+                for t in tar:
+                    self.assertEqual(name, t.name)
+                    break
+
+    # Test the same as above for the 100 bytes link field.
+    def test_unicode_link1(self):
+        self._test_ustar_link("0123456789" * 10)
+        self._test_ustar_link("0123456789" * 10 + "0", ValueError)
+        self._test_ustar_link("0123456789" * 9 + "01234567\xff")
+        self._test_ustar_link("0123456789" * 9 + "012345678\xff", ValueError)
+
+    def test_unicode_link2(self):
+        self._test_ustar_link("0123456789" * 9 + "012345\xff\xff")
+        self._test_ustar_link("0123456789" * 9 + "0123456\xff\xff", ValueError)
+
+    def _test_ustar_link(self, name, exc=None):
+        with tarfile.open(tmpname, "w", format=self.format, encoding="utf-8") as tar:
+            t = tarfile.TarInfo("foo")
+            t.linkname = name
+            if exc is None:
+                tar.addfile(t)
+            else:
+                self.assertRaises(exc, tar.addfile, t)
+
+        if exc is None:
+            with tarfile.open(tmpname, "r") as tar:
+                for t in tar:
+                    self.assertEqual(name, t.linkname)
+                    break
+
+
+class GNUUnicodeTest(UnicodeTest, unittest.TestCase):
 
     format = tarfile.GNU_FORMAT
 
@@ -1768,7 +1845,7 @@
                     self.fail("unable to read bad GNU tar pax header")
 
 
-class PAXUnicodeTest(UstarUnicodeTest):
+class PAXUnicodeTest(UnicodeTest, unittest.TestCase):
 
     format = tarfile.PAX_FORMAT
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -107,6 +107,9 @@
 Library
 -------
 
+- Issue #24838: tarfile's ustar and gnu formats now correctly calculate name
+  and link field limits for multibyte character encodings like utf-8.
+
 - Issue #26657: Fix directory traversal vulnerability with http.server on
   Windows.  This fixes a regression that was introduced in 3.3.4rc1 and
   3.4.0rc1.  Based on patch by Philipp Hagemeister.

-- 
Repository URL: https://hg.python.org/cpython