[Python-checkins] cpython (merge 3.5 -> default): Issue #19883: Fixed possible integer overflows in zipimport.

Thu Jan 28 14:33:32 EST 2016

https://hg.python.org/cpython/rev/f4631dc56ecf
changeset:   100103:f4631dc56ecf
parent:      100101:cf06a36f5f64
parent:      100102:687be1cbe587
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Thu Jan 28 21:32:39 2016 +0200
summary:
  Issue #19883: Fixed possible integer overflows in zipimport.

files:
  Misc/NEWS           |    2 +
  Modules/zipimport.c |  369 ++++++++++++++++++-------------
  2 files changed, 217 insertions(+), 154 deletions(-)

diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -159,6 +159,8 @@
 Library
 -------
 
+- Issue #19883: Fixed possible integer overflows in zipimport.
+
 - Issue #26227: On Windows, getnameinfo(), gethostbyaddr() and
   gethostbyname_ex() functions of the socket module now decode the hostname
   from the ANSI code page rather than UTF-8.
diff --git a/Modules/zipimport.c b/Modules/zipimport.c
--- a/Modules/zipimport.c
+++ b/Modules/zipimport.c
@@ -827,23 +827,43 @@
 
 /* implementation */
 
-/* Given a buffer, return the long that is represented by the first
+/* Given a buffer, return the unsigned int that is represented by the first
    4 bytes, encoded as little endian. This partially reimplements
    marshal.c:r_long() */
-static long
-get_long(const unsigned char *buf) {
-    long x;
+static unsigned int
+get_uint32(const unsigned char *buf)
+{
+    unsigned int x;
     x =  buf[0];
-    x |= (long)buf[1] <<  8;
-    x |= (long)buf[2] << 16;
-    x |= (long)buf[3] << 24;
-#if SIZEOF_LONG > 4
-    /* Sign extension for 64-bit machines */
-    x |= -(x & 0x80000000L);
-#endif
+    x |= (unsigned int)buf[1] <<  8;
+    x |= (unsigned int)buf[2] << 16;
+    x |= (unsigned int)buf[3] << 24;
     return x;
 }
 
+/* Given a buffer, return the unsigned short that is represented by the first
+   2 bytes, encoded as little endian. This partially reimplements
+   marshal.c:r_short() */
+static unsigned short
+get_uint16(const unsigned char *buf)
+{
+    unsigned short x;
+    x =  buf[0];
+    x |= (unsigned short)buf[1] <<  8;
+    return x;
+}
+
+static void
+set_file_error(PyObject *archive, int eof)
+{
+    if (eof) {
+        PyErr_SetString(PyExc_EOFError, "EOF read where not expected");
+    }
+    else {
+        PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, archive);
+    }
+}
+
 /*
    read_directory(archive) -> files dict (new reference)
 
@@ -871,20 +891,18 @@
 {
     PyObject *files = NULL;
     FILE *fp;
-    unsigned short flags;
-    short compress, time, date, name_size;
-    long crc, data_size, file_size, header_size;
-    Py_ssize_t file_offset, header_position, header_offset;
-    long l, count;
-    Py_ssize_t i;
+    unsigned short flags, compress, time, date, name_size;
+    unsigned int crc, data_size, file_size, header_size, header_offset;
+    unsigned long file_offset, header_position;
+    unsigned long arc_offset;  /* Absolute offset to start of the zip-archive. */
+    unsigned int count, i;
+    unsigned char buffer[46];
     char name[MAXPATHLEN + 5];
-    char dummy[8]; /* Buffer to read unused header values into */
     PyObject *nameobj = NULL;
-    char *p, endof_central_dir[22];
-    Py_ssize_t arc_offset;  /* Absolute offset to start of the zip-archive. */
     PyObject *path;
     const char *charset;
     int bootstrap;
+    const char *errmsg = NULL;
 
     fp = _Py_fopen_obj(archive, "rb");
     if (fp == NULL) {
@@ -898,91 +916,112 @@
     }
 
     if (fseek(fp, -22, SEEK_END) == -1) {
-        fclose(fp);
-        PyErr_Format(ZipImportError, "can't read Zip file: %R", archive);
-        return NULL;
+        goto file_error;
     }
-    header_position = ftell(fp);
-    if (fread(endof_central_dir, 1, 22, fp) != 22) {
-        fclose(fp);
-        PyErr_Format(ZipImportError, "can't read Zip file: %R", archive);
-        return NULL;
+    header_position = (unsigned long)ftell(fp);
+    if (header_position == (unsigned long)-1) {
+        goto file_error;
     }
-    if (get_long((unsigned char *)endof_central_dir) != 0x06054B50) {
+    assert(header_position <= (unsigned long)LONG_MAX);
+    if (fread(buffer, 1, 22, fp) != 22) {
+        goto file_error;
+    }
+    if (get_uint32(buffer) != 0x06054B50u) {
         /* Bad: End of Central Dir signature */
-        fclose(fp);
-        PyErr_Format(ZipImportError, "not a Zip file: %R", archive);
-        return NULL;
+        errmsg = "not a Zip file";
+        goto invalid_header;
     }
 
-    header_size = get_long((unsigned char *)endof_central_dir + 12);
-    header_offset = get_long((unsigned char *)endof_central_dir + 16);
-    arc_offset = header_position - header_offset - header_size;
-    header_offset += arc_offset;
+    header_size = get_uint32(buffer + 12);
+    header_offset = get_uint32(buffer + 16);
+    if (header_position < header_size) {
+        errmsg = "bad central directory size";
+        goto invalid_header;
+    }
+    if (header_position < header_offset) {
+        errmsg = "bad central directory offset";
+        goto invalid_header;
+    }
+    if (header_position - header_size < header_offset) {
+        errmsg = "bad central directory size or offset";
+        goto invalid_header;
+    }
+    header_position -= header_size;
+    arc_offset = header_position - header_offset;
 
     files = PyDict_New();
-    if (files == NULL)
+    if (files == NULL) {
         goto error;
-
+    }
     /* Start of Central Directory */
     count = 0;
-    if (fseek(fp, header_offset, 0) == -1)
+    if (fseek(fp, (long)header_position, 0) == -1) {
         goto file_error;
+    }
     for (;;) {
         PyObject *t;
+        size_t n;
         int err;
 
+        n = fread(buffer, 1, 46, fp);
+        if (n < 4) {
+            goto eof_error;
+        }
         /* Start of file header */
-        l = PyMarshal_ReadLongFromFile(fp);
-        if (l == -1 && PyErr_Occurred())
-            goto error;
-        if (l != 0x02014B50)
+        if (get_uint32(buffer) != 0x02014B50u) {
             break;              /* Bad: Central Dir File Header */
+        }
+        if (n != 46) {
+            goto eof_error;
+        }
+        flags = get_uint16(buffer + 8);
+        compress = get_uint16(buffer + 10);
+        time = get_uint16(buffer + 12);
+        date = get_uint16(buffer + 14);
+        crc = get_uint32(buffer + 16);
+        data_size = get_uint32(buffer + 20);
+        file_size = get_uint32(buffer + 24);
+        name_size = get_uint16(buffer + 28);
+        header_size = (unsigned int)name_size +
+           get_uint16(buffer + 30) /* extra field */ +
+           get_uint16(buffer + 32) /* comment */;
 
-        /* On Windows, calling fseek to skip over the fields we don't use is
-        slower than reading the data into a dummy buffer because fseek flushes
-        stdio's internal buffers. See issue #8745. */
-        if (fread(dummy, 1, 4, fp) != 4) /* Skip unused fields, avoid fseek */
+        file_offset = get_uint32(buffer + 42);
+        if (file_offset > header_offset) {
+            errmsg = "bad local header offset";
+            goto invalid_header;
+        }
+        file_offset += arc_offset;
+
+        if (name_size > MAXPATHLEN) {
+            name_size = MAXPATHLEN;
+        }
+        if (fread(name, 1, name_size, fp) != name_size) {
             goto file_error;
-
-        flags = (unsigned short)PyMarshal_ReadShortFromFile(fp);
-        compress = PyMarshal_ReadShortFromFile(fp);
-        time = PyMarshal_ReadShortFromFile(fp);
-        date = PyMarshal_ReadShortFromFile(fp);
-        crc = PyMarshal_ReadLongFromFile(fp);
-        data_size = PyMarshal_ReadLongFromFile(fp);
-        file_size = PyMarshal_ReadLongFromFile(fp);
-        name_size = PyMarshal_ReadShortFromFile(fp);
-        header_size = name_size +
-           PyMarshal_ReadShortFromFile(fp) +
-           PyMarshal_ReadShortFromFile(fp);
-        if (PyErr_Occurred())
-            goto error;
-
-        if (fread(dummy, 1, 8, fp) != 8) /* Skip unused fields, avoid fseek */
-            goto file_error;
-        file_offset = PyMarshal_ReadLongFromFile(fp) + arc_offset;
-        if (PyErr_Occurred())
-            goto error;
-
-        if (name_size > MAXPATHLEN)
-            name_size = MAXPATHLEN;
-
-        p = name;
-        for (i = 0; i < (Py_ssize_t)name_size; i++) {
-            *p = (char)getc(fp);
-            if (*p == '/')
-                *p = SEP;
-            p++;
         }
-        *p = 0;         /* Add terminating null byte */
-        for (; i < header_size; i++) /* Skip the rest of the header */
-            if(getc(fp) == EOF) /* Avoid fseek */
+        name[name_size] = '\0';  /* Add terminating null byte */
+        if (SEP != '/') {
+            for (i = 0; i < name_size; i++) {
+                if (name[i] == '/') {
+                    name[i] = SEP;
+                }
+            }
+        }
+        /* Skip the rest of the header.
+         * On Windows, calling fseek to skip over the fields we don't use is
+         * slower than reading the data because fseek flushes stdio's
+         * internal buffers.  See issue #8745. */
+        assert(header_size <= 3*0xFFFFu);
+        for (i = name_size; i < header_size; i++) {
+            if (getc(fp) == EOF) {
                 goto file_error;
+            }
+        }
 
         bootstrap = 0;
-        if (flags & 0x0800)
+        if (flags & 0x0800) {
             charset = "utf-8";
+        }
         else if (!PyThreadState_GET()->interp->codecs_initialized) {
             /* During bootstrap, we may need to load the encodings
                package from a ZIP file. But the cp437 encoding is implemented
@@ -993,44 +1032,59 @@
             charset = "ascii";
             bootstrap = 1;
         }
-        else
+        else {
             charset = "cp437";
+        }
         nameobj = PyUnicode_Decode(name, name_size, charset, NULL);
         if (nameobj == NULL) {
-            if (bootstrap)
+            if (bootstrap) {
                 PyErr_Format(PyExc_NotImplementedError,
                     "bootstrap issue: python%i%i.zip contains non-ASCII "
                     "filenames without the unicode flag",
                     PY_MAJOR_VERSION, PY_MINOR_VERSION);
+            }
             goto error;
         }
-        if (PyUnicode_READY(nameobj) == -1)
+        if (PyUnicode_READY(nameobj) == -1) {
             goto error;
+        }
         path = PyUnicode_FromFormat("%U%c%U", archive, SEP, nameobj);
-        if (path == NULL)
+        if (path == NULL) {
             goto error;
-        t = Py_BuildValue("Nhllnhhl", path, compress, data_size,
+        }
+        t = Py_BuildValue("NHIIkHHI", path, compress, data_size,
                           file_size, file_offset, time, date, crc);
-        if (t == NULL)
+        if (t == NULL) {
             goto error;
+        }
         err = PyDict_SetItem(files, nameobj, t);
         Py_CLEAR(nameobj);
         Py_DECREF(t);
-        if (err != 0)
+        if (err != 0) {
             goto error;
+        }
         count++;
     }
     fclose(fp);
-    if (Py_VerboseFlag)
-        PySys_FormatStderr("# zipimport: found %ld names in %R\n",
+    if (Py_VerboseFlag) {
+        PySys_FormatStderr("# zipimport: found %u names in %R\n",
                            count, archive);
+    }
     return files;
+
+eof_error:
+    set_file_error(archive, !ferror(fp));
+    goto error;
+
 file_error:
-    fclose(fp);
-    Py_XDECREF(files);
-    Py_XDECREF(nameobj);
     PyErr_Format(ZipImportError, "can't read Zip file: %R", archive);
-    return NULL;
+    goto error;
+
+invalid_header:
+    assert(errmsg != NULL);
+    PyErr_Format(ZipImportError, "%s: %R", errmsg, archive);
+    goto error;
+
 error:
     fclose(fp);
     Py_XDECREF(files);
@@ -1076,17 +1130,18 @@
 static PyObject *
 get_data(PyObject *archive, PyObject *toc_entry)
 {
-    PyObject *raw_data, *data = NULL, *decompress;
+    PyObject *raw_data = NULL, *data, *decompress;
     char *buf;
     FILE *fp;
-    int err;
-    Py_ssize_t bytes_read = 0;
-    long l;
     PyObject *datapath;
-    long compress, data_size, file_size, file_offset, bytes_size;
-    long time, date, crc;
+    unsigned short compress, time, date;
+    unsigned int crc;
+    Py_ssize_t data_size, file_size, bytes_size;
+    long file_offset, header_size;
+    unsigned char buffer[30];
+    const char *errmsg = NULL;
 
-    if (!PyArg_ParseTuple(toc_entry, "Olllllll", &datapath, &compress,
+    if (!PyArg_ParseTuple(toc_entry, "OHnnlHHI", &datapath, &compress,
                           &data_size, &file_size, &file_offset, &time,
                           &date, &crc)) {
         return NULL;
@@ -1097,39 +1152,30 @@
     }
 
     fp = _Py_fopen_obj(archive, "rb");
-    if (!fp)
+    if (!fp) {
         return NULL;
-
+    }
     /* Check to make sure the local file header is correct */
     if (fseek(fp, file_offset, 0) == -1) {
-        fclose(fp);
-        PyErr_Format(ZipImportError, "can't read Zip file: %R", archive);
-        return NULL;
+        goto file_error;
+    }
+    if (fread(buffer, 1, 30, fp) != 30) {
+        goto eof_error;
+    }
+    if (get_uint32(buffer) != 0x04034B50u) {
+        /* Bad: Local File Header */
+        errmsg = "bad local file header";
+        goto invalid_header;
     }
 
-    l = PyMarshal_ReadLongFromFile(fp);
-    if (l != 0x04034B50) {
-        /* Bad: Local File Header */
-        if (!PyErr_Occurred())
-            PyErr_Format(ZipImportError,
-                         "bad local file header in %U",
-                         archive);
-        fclose(fp);
-        return NULL;
+    header_size = (unsigned int)30 +
+        get_uint16(buffer + 26) /* file name */ +
+        get_uint16(buffer + 28) /* extra field */;
+    if (file_offset > LONG_MAX - header_size) {
+        errmsg = "bad local file header size";
+        goto invalid_header;
     }
-    if (fseek(fp, file_offset + 26, 0) == -1) {
-        fclose(fp);
-        PyErr_Format(ZipImportError, "can't read Zip file: %R", archive);
-        return NULL;
-    }
-
-    l = 30 + PyMarshal_ReadShortFromFile(fp) +
-        PyMarshal_ReadShortFromFile(fp);        /* local header size */
-    if (PyErr_Occurred()) {
-        fclose(fp);
-        return NULL;
-    }
-    file_offset += l;           /* Start of file data */
+    file_offset += header_size;  /* Start of file data */
 
     if (data_size > LONG_MAX - 1) {
         fclose(fp);
@@ -1137,33 +1183,27 @@
         return NULL;
     }
     bytes_size = compress == 0 ? data_size : data_size + 1;
-    if (bytes_size == 0)
+    if (bytes_size == 0) {
         bytes_size++;
+    }
     raw_data = PyBytes_FromStringAndSize((char *)NULL, bytes_size);
-
     if (raw_data == NULL) {
-        fclose(fp);
-        return NULL;
+        goto error;
     }
     buf = PyBytes_AsString(raw_data);
 
-    err = fseek(fp, file_offset, 0);
-    if (err == 0) {
-        bytes_read = fread(buf, 1, data_size, fp);
-    } else {
-        fclose(fp);
-        Py_DECREF(raw_data);
-        PyErr_Format(ZipImportError, "can't read Zip file: %R", archive);
-        return NULL;
+    if (fseek(fp, file_offset, 0) == -1) {
+        goto file_error;
     }
-    fclose(fp);
-    if (err || bytes_read != data_size) {
+    if (fread(buf, 1, data_size, fp) != (size_t)data_size) {
         PyErr_SetString(PyExc_IOError,
                         "zipimport: can't read data");
-        Py_DECREF(raw_data);
-        return NULL;
+        goto error;
     }
 
+    fclose(fp);
+    fp = NULL;
+
     if (compress != 0) {
         buf[data_size] = 'Z';  /* saw this in zipfile.py */
         data_size++;
@@ -1186,9 +1226,28 @@
     }
     data = PyObject_CallFunction(decompress, "Oi", raw_data, -15);
     Py_DECREF(decompress);
-error:
     Py_DECREF(raw_data);
     return data;
+
+eof_error:
+    set_file_error(archive, !ferror(fp));
+    goto error;
+
+file_error:
+    PyErr_Format(ZipImportError, "can't read Zip file: %R", archive);
+    goto error;
+
+invalid_header:
+    assert(errmsg != NULL);
+    PyErr_Format(ZipImportError, "%s: %R", errmsg, archive);
+    goto error;
+
+error:
+    if (fp != NULL) {
+        fclose(fp);
+    }
+    Py_XDECREF(raw_data);
+    return NULL;
 }
 
 /* Lenient date/time comparison function. The precision of the mtime
@@ -1213,37 +1272,39 @@
 unmarshal_code(PyObject *pathname, PyObject *data, time_t mtime)
 {
     PyObject *code;
-    char *buf = PyBytes_AsString(data);
+    unsigned char *buf = (unsigned char *)PyBytes_AsString(data);
     Py_ssize_t size = PyBytes_Size(data);
 
-    if (size <= 9) {
+    if (size < 12) {
         PyErr_SetString(ZipImportError,
                         "bad pyc data");
         return NULL;
     }
 
-    if (get_long((unsigned char *)buf) != PyImport_GetMagicNumber()) {
-        if (Py_VerboseFlag)
+    if (get_uint32(buf) != (unsigned int)PyImport_GetMagicNumber()) {
+        if (Py_VerboseFlag) {
             PySys_FormatStderr("# %R has bad magic\n",
                                pathname);
+        }
         Py_INCREF(Py_None);
         return Py_None;  /* signal caller to try alternative */
     }
 
-    if (mtime != 0 && !eq_mtime(get_long((unsigned char *)buf + 4),
-                                mtime)) {
-        if (Py_VerboseFlag)
+    if (mtime != 0 && !eq_mtime(get_uint32(buf + 4), mtime)) {
+        if (Py_VerboseFlag) {
             PySys_FormatStderr("# %R has bad mtime\n",
                                pathname);
+        }
         Py_INCREF(Py_None);
         return Py_None;  /* signal caller to try alternative */
     }
 
     /* XXX the pyc's size field is ignored; timestamp collisions are probably
        unimportant with zip files. */
-    code = PyMarshal_ReadObjectFromString(buf + 12, size - 12);
-    if (code == NULL)
+    code = PyMarshal_ReadObjectFromString((char *)buf + 12, size - 12);
+    if (code == NULL) {
         return NULL;
+    }
     if (!PyCode_Check(code)) {
         Py_DECREF(code);
         PyErr_Format(PyExc_TypeError,

-- 
Repository URL: https://hg.python.org/cpython