Fixes to zipfile.py [PATCH]

Fri Mar 7 17:39:25 EST 2003

Hello,

I have an instance where I need to use zipfile.py for reading large
(50,000-file, 500MB or larger) ZIP files.  Unfortunately, it is rather
poorly suited to this.  For one, it allocates tens of thousands of
ZipInfo objects on open of the ZIP file, taking 10-30 seconds and
eating close to 100MB of RAM.  For another, it provides no way to
extract a file in a ZIP archive directly to a file.  This means that
extracting a 100MB file requires 100MB RAM.

What I have done is write a ZipReader class dedicated to reading
files.  Some of the scanning that ZipFile does on open is necessary to
help with writing or updating, but not for reading.

My modifications have reduced the runtime of one torture test case from over
100 seconds to about 20, and another from over 200 seconds to 80.

The basic idea is that, on opening, we scan the central directory,
noting solely the location of each file's central directory entry.
This gets saved in a dictionary keyed by filename.  This approach saves
huge amounts of time on open, and makes access to any individual file
fairly quick as well.  Also, as part of this scanning, we note the
names of directories encountered, to make it easier for software that
needs to do the equivalent of os.listdir() on a directory inside a ZIP
file.

In general, this approach will be much faster when accessing one or two
files from a large ZIP file, about the same when dealing with a small
ZIP file, and a little slower when accessing many files from a large
ZIP file.  I believe it's a net gain.

Another nifty hack: reuse some code from socket.py to provide a
file-like object capable of doing readline() and readlines() directly
on data being read from a compressed file.  It would be nice to see
this socket.py code generalized to support any class that provides a
read().

I have used this code in PyGopherd to provide seamless serving of
Gopher and Web sites directly from ZIP files.

Here is the patch against the zipfile.py in CVS:

--- /home/jgoerzen/zipfile.py	2003-03-06 21:22:29.000000000 -0600
+++ zipfile.py	2003-03-07 16:22:13.000000000 -0600
@@ -2,8 +2,14 @@
 # Written by James C. Ahlstrom jim at interet.com
 # All rights transferred to CNRI pursuant to the Python contribution agreement
 
-import struct, os, time
+import struct, os, time, types
 import binascii
+from StringIO import StringIO
+from socket import _fileobject as BaseFileSimulator
+
+_STRING_TYPES = (types.StringType,)  # types ZipReader treats as a path name
+if hasattr(types, "UnicodeType"):  # only add it when the types module has it
+    _STRING_TYPES = _STRING_TYPES + (types.UnicodeType,)
 
 try:
     import zlib # We may need its compression method
@@ -167,6 +173,304 @@
     def _normpath(path):
         return path
 
+class FileSimulator(BaseFileSimulator):  # socket._fileobject with a no-op close()
+    def close(self):
+        pass
+
+class ZipDecompressor:
+    def __init__(self, fd, zinfo):
+        self.fp = fd
+        self.zinfo = zinfo
+        self.buffer = ''                # Decompressed data not yet returned
+        self.bytesread = 0              # Bytes consumed from the archive
+        self.byteswritten = 0           # Bytes handed back to the caller
+        self.crc = binascii.crc32("")   # Running CRC-32 of the member data
+        self.filepos = self.fp.tell()
+
+        if zinfo.compress_type == ZIP_STORED:
+            self.read = self.read_stored
+        elif zinfo.compress_type == ZIP_DEFLATED:
+            self.read = self.read_deflated
+            self.dc = zlib.decompressobj(-15)   # negative wbits: raw deflate
+        self.recv = self.read  # alias, presumably for socket._fileobject; fails if read is unset
+
+    def copyto(self, destfd, size = -1):  # copy size bytes (all if < 1) to destfd
+        if size < 1:
+            size = self.zinfo.file_size
+        while size > 0:
+            data = self.read(min(4096, size))
+            if not data: break          # source exhausted early: stop, don't spin
+            destfd.write(data)
+            size -= len(data)
+
+    def _finalize(self):  # compare the accumulated CRC-32 with zinfo.CRC
+        if self.crc != self.zinfo.CRC:
+            raise BadZipfile, "Bad CRC-32 for file %s" % self.zinfo.filename
+
+    def flush(self):  # read and discard the rest of the member, then check the CRC
+        while self.byteswritten < self.zinfo.file_size:
+            self.read(min(4096, self.zinfo.file_size - self.byteswritten))
+        self._finalize()
+
+    def close(self):  # no-op: self.fp is left open
+        pass
+
+    def read_stored(self, count = -1):
+        # Return up to count bytes of stored data (all that is left if < 1).
+        if count < 1 or count > self.zinfo.compress_size - self.bytesread:
+            count = self.zinfo.compress_size - self.bytesread
+        if count < 1:
+            return ''
+        self.fp.seek(self.zinfo.file_offset + self.bytesread)
+        retval = ''
+        while len(retval) < count:
+            data = self.fp.read(min(4096, count - len(retval)))
+            if not data:                # EOF inside the member: don't spin
+                raise BadZipfile, "Truncated data for file %s" % self.zinfo.filename
+            retval += data
+        self.byteswritten = self.bytesread = self.bytesread + count
+        self.crc = binascii.crc32(retval, self.crc)
+        if self.bytesread == self.zinfo.compress_size:
+            self._finalize()
+        return retval
+
+    def read_deflated(self, count = -1):
+        # Return up to count decompressed bytes (all that is left if < 1).
+        if count < 1 or count > self.zinfo.file_size - self.byteswritten:
+            count = self.zinfo.file_size - self.byteswritten
+        if count < 1:
+            return ''
+        self.fp.seek(self.zinfo.file_offset + self.bytesread)
+
+        # First, fill up the buffer.
+        while len(self.buffer) < count:
+            bytes = self.fp.read(min(self.zinfo.compress_size - self.bytesread, 4096))
+            if not bytes:       # no input left although more output is expected
+                raise BadZipfile, "Truncated data for file %s" % self.zinfo.filename
+            self.bytesread += len(bytes)
+            result = self.dc.decompress(bytes)
+            if len(result):
+                self.buffer += result
+                self.crc = binascii.crc32(result, self.crc)
+            if self.bytesread == self.zinfo.compress_size:
+                bytes = self.dc.decompress('Z') + self.dc.flush()  # dummy pad byte
+                if len(bytes):
+                    self.buffer += bytes
+                    self.crc = binascii.crc32(bytes, self.crc)
+                self._finalize()
+
+        retval = self.buffer[:count]
+        self.byteswritten += len(retval)
+        self.buffer = self.buffer[count:]
+        return retval
+                
+###########################################################################
+###########################################################################
+###########################################################################
+# New ZipReader class
+
+
+class ZipReader:
+    """ Class with methods to open, read, close, list zip files.
+
+    z = ZipReader(file)
+
+    file: Either the path to the file, or a file-like object.
+          If it is a path, the file will be opened and closed by ZipReader.
+    """
+
+    fp = None                   # Set here since __del__ checks it
+
+    def __init__(self, file):
+        """Open the ZIP file 'file' (a path or a file-like object) for reading."""
+        self.debug = 0  # Level of printing: 0 through 3
+        self.locationmap = {'': -1} # name -> central dir header pos; -1 = directory
+
+        # A string is a path we open (and later close); anything else is used as-is
+        if isinstance(file, _STRING_TYPES):
+            self._filePassed = 0
+            self.filename = file
+            self.fp = open(file, 'rb')
+        else:
+            self._filePassed = 1
+            self.fp = file
+            self.filename = getattr(file, 'name', None)
+
+
+        self._GetContents()
+
+    def _GetContents(self):
+        """Read the directory, making sure we close the file if the format
+        is bad."""
+        try:
+            self._RealGetContents()
+        except BadZipfile:
+            if not self._filePassed:    # only close a file we opened ourselves
+                self.fp.close()
+                self.fp = None
+            raise
+
+    def _RealGetContents(self):
+        """Read in the table of contents for the ZIP file."""
+        fp = self.fp
+        endrec = _EndRecData(fp)
+        if not endrec:
+            raise BadZipfile, "File is not a zip file"
+        if self.debug > 1:
+            print endrec
+        size_cd = endrec[5]             # bytes in central directory
+        offset_cd = endrec[6]   # offset of central directory
+        self.comment = endrec[8]        # archive comment
+        # endrec[9] is the offset of the "End of Central Dir" record
+        x = endrec[9] - size_cd
+        # "concat" is zero, unless zip was concatenated to another file
+        concat = x - offset_cd
+        self.concat = concat
+        if self.debug > 2:
+            print "given, inferred, offset", offset_cd, x, concat
+        # self.start_dir:  Position of start of central directory
+        self.start_dir = offset_cd + concat
+        fp.seek(self.start_dir, 0)
+        total = 0
+        while total < size_cd:
+            centdir = self._getcentdir() # Reads 46 bytes
+            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
+
+            self.locationmap[filename] = self.start_dir + total
+            components = filename.split('/')
+            self.locationmap.setdefault('/'.join(components[:-1]), -1)  # parent dir
+            # Skip past the other stuff.
+            total = (total + 46 + centdir[_CD_FILENAME_LENGTH]
+                     + centdir[_CD_EXTRA_FIELD_LENGTH]
+                     + centdir[_CD_COMMENT_LENGTH])
+            fp.seek(self.start_dir + total, 0)
+
+    def namelist(self):
+        """Return a list of the file and directory names recorded ('' too)."""
+        return self.locationmap.keys()
+
+    def namelistiter(self):  # same names as namelist(), as an iterator
+        return self.locationmap.iterkeys()
+
+    def hasfile(self, name):  # also true for directory names in locationmap
+        return self.locationmap.has_key(name)
+
+    def infolist(self):
+        """Return a list of ZipInfo instances for files in the archive."""
+        return [self._getinfofrompos(pos)  # pos < 0 marks a directory: skipped
+                for pos in self.locationmap.itervalues() if pos >= 0]
+
+    def printdir(self):
+        """Print a table of contents for the zip file."""
+        print "%-46s %19s %12s" % ("File Name", "Modified    ", "Size")
+        for pos in [p for p in self.locationmap.itervalues() if p >= 0]:
+            zinfo = self._getinfofrompos(pos)  # p < 0 marks a directory: skipped
+            date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time
+            print "%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size)
+
+    def testzip(self):
+        """Read all the files and check the CRC; return the first bad name."""
+        for name in [n for n, p in self.locationmap.items() if p >= 0]:
+            try:
+                self.read(name)       # Check CRC-32
+            except:
+                return name
+
+    def getinfo(self, name):
+        """Return the instance of ZipInfo given 'name'."""
+        return self._getinfofrompos(self.locationmap[name])  # KeyError: unknown or dir
+
+    def _getcentdir(self):
+        """Read central directory info from an ALREADY-SEEKED fp!"""
+        centdir = self.fp.read(46)      # a short read means a truncated directory
+        if len(centdir) != 46 or centdir[0:4] != stringCentralDir:
+            raise BadZipfile, "Bad magic number for central directory"
+        centdir = struct.unpack(structCentralDir, centdir)
+        if self.debug > 2:
+            print centdir
+        return centdir
+
+    def _getinfofrompos(self, location):
+        """Build the ZipInfo for the central directory record at 'location'."""
+        if location < 0:
+            raise KeyError, "Attempt to get information from non-file"
+        fp = self.fp
+        fp.seek(location, 0)
+        centdir = self._getcentdir()
+        filename = fp.read(centdir[_CD_FILENAME_LENGTH])
+        # Create ZipInfo instance to store file information
+        x = ZipInfo(filename)
+        x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
+        x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
+        x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] + self.concat
+        # file_offset must be computed below...
+        (x.create_version, x.create_system, x.extract_version, x.reserved,
+            x.flag_bits, x.compress_type, t, d,
+            x.CRC, x.compress_size, x.file_size) = centdir[1:12]
+        x.volume, x.internal_attr, x.external_attr = centdir[15:18]
+        # Convert date/time code to (year, month, day, hour, min, sec)
+        x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
+                                 t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
+
+        # And now, read the info from the file's header.
+        data = x
+        fp.seek(data.header_offset, 0)
+        fheader = fp.read(30)           # a short read means a truncated archive
+        if len(fheader) != 30 or fheader[0:4] != stringFileHeader:
+            raise BadZipfile, "Bad magic number for file header"
+        fheader = struct.unpack(structFileHeader, fheader)
+        # file_offset is computed here, since the extra field for
+        # the central directory and for the local file header
+        # refer to different fields, and they can have different
+        # lengths
+        data.file_offset = (data.header_offset + 30
+                            + fheader[_FH_FILENAME_LENGTH]
+                            + fheader[_FH_EXTRA_FIELD_LENGTH])
+        fname = fp.read(fheader[_FH_FILENAME_LENGTH])
+        if fname != data.filename:
+            raise RuntimeError, \
+                  'File name in directory "%s" and header "%s" differ.' % (
+                      data.filename, fname)
+
+        return data
+
+    def read(self, name):  # return the whole contents of member 'name' as a string
+        fd = StringIO()
+        self.copyto(name, fd)   # copyto() returns None, so keep our own buffer
+        return fd.getvalue()
+
+    def _open_zinfo(self, zi):  # wrap a ZipDecompressor for zi in a FileSimulator
+        return FileSimulator(ZipDecompressor(self.fp, zi), None, -1)
+        
+    def open(self, name):  # return a FileSimulator reading member 'name'
+        if not self.fp:
+            raise RuntimeError, \
+                  "Attempt to read ZIP archive that was already closed"
+        return self._open_zinfo(self.getinfo(name))
+
+    def copyto(self, name, fd):
+        """Copy the contents of the named file to the given descriptor."""
+        ZipDecompressor(self.fp, self.getinfo(name)).copyto(fd)  # not via open()
+
+    def __del__(self):
+        """Call the "close()" method in case the user forgot."""
+        self.close()    # harmless if already closed: close() returns when fp is None
+
+    def close(self):
+        """Close the underlying file if this object opened it, and drop
+        the reference to it."""
+        if self.fp is None:
+            return
+        if not self._filePassed:
+            self.fp.close()
+        self.fp = None
+
+
+###########################################################################
+###########################################################################
+###########################################################################
+# Old ZipFile class
+
 
 class ZipFile:
     """ Class with methods to open, read, write, close, list zip files.