3.2 can't extract tarfile produced by 2.7

Benjamin Kaplan benjamin.kaplan at case.edu
Wed Dec 26 17:11:17 CET 2012


On Dec 26, 2012 11:00 AM, "Antoon Pardon" <antoon.pardon at rece.vub.ac.be>
wrote:
>
> I am converting some programs to python 3. These programs manipulate
tarfiles. In order for the python3 programs to be really useful
> they need to be able to process the tarfiles produced by python2; that,
however, seems to be a problem.
>
> This is testcode that produces a tarfile.
>
> #! /usr/bin/python
>
> compression = "bz2"
> tarmode = "w|%s" % compression
> rt = '.'
>
> import os
> import os.path
> import errno
>
> import tarfile as tar
>
> def process():
>     pj = os.path.join
>     entries = os.listdir(rt)
>     of = open("DUMP.tbz", "w")
>     tf = tar.open(mode = tarmode, fileobj = of,
>                   encoding = 'ascii', format = tar.PAX_FORMAT)
>     for entry in entries:
>         fqpn = pj(rt, entry)
>         try:
>             tf.add(fqpn, entry, recursive = False)
>         except OSError as ErrInfo:
>             print("%s: disappeared" % fqpn)
>             if ErrInfo.errno != errno.ENOENT:
>                 raise
>     tf.close()
>     of.close()
>
> if __name__ == "__main__":
>     process()
>
>
==============================================================================
> This is testcode that checks a tarfile
>
> #!/usr/bin/python
>
> compression = "bz2"
> tarmode = "r|%s" % compression
>
> import os
> import os.path
> import stat
>
> import tarfile as tar
>
> def equalfile(fl1, fl2):
>     bf1 = fl1.read(8192)
>     bf2 = fl2.read(8192)
>     while bf1 == bf2:
>         if bf1 == "":
>             return True
>         bf1 = fl1.read(8192)
>         bf2 = fl2.read(8192)
>     return False
>
> def process():
>     gf = open("DUMP.tbz", "r")
>     tf = tar.open(mode = tarmode, fileobj = gf,
>                   encoding = 'ascii', format = tar.PAX_FORMAT)
>     for tarinfo in tf:
>         entry = tarinfo.name
>         fileinfo = os.stat(entry)
>         if stat.S_ISREG(fileinfo.st_mode) and tarinfo.isreg():
>             bfl = tf.extractfile(tarinfo)
>             ofl = open(entry)
>             if not equalfile(bfl, ofl):
>                 print("%s: does not match backup" % entry)
>                 sync = False
>     tf.close()
>     gf.close()
>
> if __name__ == "__main__":
>     process()
>
>
=================================================================================
>
> When I use python2.7 to produce and later check the tarfile everything
works as expected. However when I use python3.2 to check the tarfile I
> get the following traceback.
>
> Traceback (most recent call last):
>   File "tarchck", line 39, in <module>
>     process()
>   File "tarchck", line 25, in process
>     encoding = 'ascii', format = tar.PAX_FORMAT)
>   File "/usr/lib/python3.2/tarfile.py", line 1771, in open
>     t = cls(name, filemode, stream, **kwargs)
>   File "/usr/lib/python3.2/tarfile.py", line 1667, in __init__
>     self.firstmember = self.next()
>   File "/usr/lib/python3.2/tarfile.py", line 2418, in next
>     tarinfo = self.tarinfo.fromtarfile(self)
>   File "/usr/lib/python3.2/tarfile.py", line 1281, in fromtarfile
>     buf = tarfile.fileobj.read(BLOCKSIZE)
>   File "/usr/lib/python3.2/tarfile.py", line 573, in read
>     buf = self._read(size)
>   File "/usr/lib/python3.2/tarfile.py", line 585, in _read
>     buf = self.__read(self.bufsize)
>   File "/usr/lib/python3.2/tarfile.py", line 604, in __read
>     buf = self.fileobj.read(self.bufsize)
>   File "/usr/lib/python3.2/codecs.py", line 300, in decode
>     (result, consumed) = self._buffer_decode(data, self.errors, final)
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9e in position 10:
invalid start byte
>
> I have been looking around but have no idea how I have to adapt this code
in order to have it process the tarfile under python3.2. The original code
didn't have the encoding and format keywords on the tar.open statement and
after reading the documentation I thought that
> would make things work, but no such luck. Further reading didn't
> provide anything useful.
>
> --
> Antoon Pardon
> --

You're opening the file in text mode, so it's trying to decode it as text
using your default encoding (utf-8). You want the file read as a series of
bytes, so open it in binary mode.

gf = open("DUMP.tbz", "rb")
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/python-list/attachments/20121226/a0fdbdc9/attachment.html>


More information about the Python-list mailing list