[Python-checkins] cpython: Issue #14684: Add support for predefined compression dictionaries to the zlib module.

Thu Jun 21 02:20:46 CEST 2012

http://hg.python.org/cpython/rev/dd4f7d5c51c7
changeset:   77528:dd4f7d5c51c7
user:        Nadeem Vawda <nadeem.vawda at gmail.com>
date:        Thu Jun 21 02:13:12 2012 +0200
summary:
  Issue #14684: Add support for predefined compression dictionaries to the zlib module.

Original patch by Sam Rushing.

files:
  Doc/library/zlib.rst  |   31 ++++++-
  Lib/test/test_zlib.py |   30 +++++++
  Misc/NEWS             |    3 +
  Modules/zlibmodule.c  |  116 ++++++++++++++++++++++++-----
  4 files changed, 152 insertions(+), 28 deletions(-)

diff --git a/Doc/library/zlib.rst b/Doc/library/zlib.rst
--- a/Doc/library/zlib.rst
+++ b/Doc/library/zlib.rst
@@ -58,12 +58,19 @@
    exception if any error occurs.
 
 
-.. function:: compressobj([level])
+.. function:: compressobj([level[, method[, wbits[, memLevel[, strategy[, zdict]]]]]])
 
    Returns a compression object, to be used for compressing data streams that won't
-   fit into memory at once.  *level* is an integer from ``1`` to ``9`` controlling
-   the level of compression; ``1`` is fastest and produces the least compression,
-   ``9`` is slowest and produces the most.  The default value is ``6``.
+   fit into memory at once.
+
+   *level* is an integer from ``1`` to ``9`` controlling the level of
+   compression; ``1`` is fastest and produces the least compression, ``9`` is
+   slowest and produces the most.  The default value is ``6``.
+
+   *zdict* is a predefined compression dictionary. This is a sequence of bytes
+   (such as a :class:`bytes` object) containing subsequences that are expected
+   to occur frequently in the data that is to be compressed. Those subsequences
+   that are expected to be most common should come at the end of the dictionary.
 
 
 .. function:: crc32(data[, value])
@@ -114,11 +121,21 @@
    to :c:func:`malloc`.  The default size is 16384.
 
 
-.. function:: decompressobj([wbits])
+.. function:: decompressobj([wbits[, zdict]])
 
    Returns a decompression object, to be used for decompressing data streams that
-   won't fit into memory at once.  The *wbits* parameter controls the size of the
-   window buffer.
+   won't fit into memory at once.
+
+   The *wbits* parameter controls the size of the window buffer.
+
+   The *zdict* parameter specifies a predefined compression dictionary. If
+   provided, this must be the same dictionary as was used by the compressor that
+   produced the data that is to be decompressed.
+
+   .. note::
+      If *zdict* is a mutable object (such as a :class:`bytearray`), you must
+      not modify its contents between the call to :func:`decompressobj` and the
+      first call to the decompressor's ``decompress()`` method.
 
 
 Compression objects support the following methods:
diff --git a/Lib/test/test_zlib.py b/Lib/test/test_zlib.py
--- a/Lib/test/test_zlib.py
+++ b/Lib/test/test_zlib.py
@@ -425,6 +425,36 @@
         dco = zlib.decompressobj()
         self.assertEqual(dco.flush(), b"") # Returns nothing
 
+    def test_dictionary(self):
+        h = HAMLET_SCENE
+        # build a simulated dictionary out of the words in HAMLET
+        words = h.split()
+        random.shuffle(words)
+        zdict = b''.join(words)
+        # use it to compress HAMLET
+        co = zlib.compressobj(zdict=zdict)
+        cd = co.compress(h) + co.flush()
+        # verify that it will decompress with the dictionary
+        dco = zlib.decompressobj(zdict=zdict)
+        self.assertEqual(dco.decompress(cd) + dco.flush(), h)
+        # verify that it fails when not given the dictionary
+        dco = zlib.decompressobj()
+        self.assertRaises(zlib.error, dco.decompress, cd)
+
+    def test_dictionary_streaming(self):
+        # this is simulating the needs of SPDY to be able to reuse the same
+        #  stream object (with its compression state) between sets of compressed
+        #  headers.
+        co = zlib.compressobj(zdict=HAMLET_SCENE)
+        do = zlib.decompressobj(zdict=HAMLET_SCENE)
+        piece = HAMLET_SCENE[1000:1500]
+        d0 = co.compress(piece) + co.flush(zlib.Z_SYNC_FLUSH)
+        d1 = co.compress(piece[100:]) + co.flush(zlib.Z_SYNC_FLUSH)
+        d2 = co.compress(piece[:-100]) + co.flush(zlib.Z_SYNC_FLUSH)
+        self.assertEqual(do.decompress(d0), piece)
+        self.assertEqual(do.decompress(d1), piece[100:])
+        self.assertEqual(do.decompress(d2), piece[:-100])
+
     def test_decompress_incomplete_stream(self):
         # This is 'foo', deflated
         x = b'x\x9cK\xcb\xcf\x07\x00\x02\x82\x01E'
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -34,6 +34,9 @@
 Library
 -------
 
+- Issue #14684: zlib.compressobj() and zlib.decompressobj() now support the use
+  of predefined compression dictionaries. Original patch by Sam Rushing.
+
 - Fix GzipFile's handling of filenames given as bytes objects.
 
 - Issue #14772: Return destination values from some shutil functions.
diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c
--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@@ -45,6 +45,7 @@
     PyObject *unconsumed_tail;
     char eof;
     int is_initialised;
+    PyObject *zdict;
     #ifdef WITH_THREAD
         PyThread_type_lock lock;
     #endif
@@ -80,14 +81,21 @@
 }
 
 PyDoc_STRVAR(compressobj__doc__,
-"compressobj([level]) -- Return a compressor object.\n"
+"compressobj([level[, method[, wbits[, memLevel[, strategy[, zdict]]]]]])\n"
+" -- Return a compressor object.\n"
 "\n"
-"Optional arg level is the compression level, in 1-9.");
+"Optional arg level is the compression level, in 1-9.\n"
+"\n"
+"Optional arg zdict is the predefined compression dictionary - a sequence of\n"
+"bytes containing subsequences that are likely to occur in the input data.");
 
 PyDoc_STRVAR(decompressobj__doc__,
-"decompressobj([wbits]) -- Return a decompressor object.\n"
+"decompressobj([wbits[, zdict]]) -- Return a decompressor object.\n"
 "\n"
-"Optional arg wbits is the window buffer size.");
+"Optional arg wbits is the window buffer size.\n"
+"\n"
+"Optional arg zdict is the predefined compression dictionary. This must be\n"
+"the same dictionary as used by the compressor that produced the input data.");
 
 static compobject *
 newcompobject(PyTypeObject *type)
@@ -98,6 +106,7 @@
         return NULL;
     self->eof = 0;
     self->is_initialised = 0;
+    self->zdict = NULL;
     self->unused_data = PyBytes_FromStringAndSize("", 0);
     if (self->unused_data == NULL) {
         Py_DECREF(self);
@@ -316,19 +325,24 @@
 }
 
 static PyObject *
-PyZlib_compressobj(PyObject *selfptr, PyObject *args)
+PyZlib_compressobj(PyObject *selfptr, PyObject *args, PyObject *kwargs)
 {
     compobject *self;
     int level=Z_DEFAULT_COMPRESSION, method=DEFLATED;
     int wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=0, err;
+    Py_buffer zdict;
+    static char *kwlist[] = {"level", "method", "wbits",
+                             "memLevel", "strategy", "zdict", NULL};
 
-    if (!PyArg_ParseTuple(args, "|iiiii:compressobj", &level, &method, &wbits,
-                          &memLevel, &strategy))
+    zdict.buf = NULL; /* Sentinel, so we can tell whether zdict was supplied. */
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iiiiiy*:compressobj",
+                                     kwlist, &level, &method, &wbits,
+                                     &memLevel, &strategy, &zdict))
         return NULL;
 
     self = newcompobject(&Comptype);
     if (self==NULL)
-        return(NULL);
+        goto error;
     self->zst.zalloc = (alloc_func)NULL;
     self->zst.zfree = (free_func)Z_NULL;
     self->zst.next_in = NULL;
@@ -337,30 +351,58 @@
     switch(err) {
     case (Z_OK):
         self->is_initialised = 1;
-        return (PyObject*)self;
+        if (zdict.buf == NULL) {
+            goto success;
+        } else {
+            err = deflateSetDictionary(&self->zst, zdict.buf, zdict.len);
+            switch (err) {
+            case (Z_OK):
+                goto success;
+            case (Z_STREAM_ERROR):
+                PyErr_SetString(PyExc_ValueError, "Invalid dictionary");
+                goto error;
+            default:
+                PyErr_SetString(PyExc_ValueError, "deflateSetDictionary()");
+                goto error;
+            }
+       }
     case (Z_MEM_ERROR):
-        Py_DECREF(self);
         PyErr_SetString(PyExc_MemoryError,
                         "Can't allocate memory for compression object");
-        return NULL;
+        goto error;
     case(Z_STREAM_ERROR):
-        Py_DECREF(self);
         PyErr_SetString(PyExc_ValueError, "Invalid initialization option");
-        return NULL;
+        goto error;
     default:
         zlib_error(self->zst, err, "while creating compression object");
-        Py_DECREF(self);
-        return NULL;
+        goto error;
     }
+
+ error:
+    Py_XDECREF(self);
+    self = NULL;
+ success:
+    if (zdict.buf != NULL)
+        PyBuffer_Release(&zdict);
+    return (PyObject*)self;
 }
 
 static PyObject *
-PyZlib_decompressobj(PyObject *selfptr, PyObject *args)
+PyZlib_decompressobj(PyObject *selfptr, PyObject *args, PyObject *kwargs)
 {
+    static char *kwlist[] = {"wbits", "zdict", NULL};
     int wbits=DEF_WBITS, err;
     compobject *self;
-    if (!PyArg_ParseTuple(args, "|i:decompressobj", &wbits))
+    PyObject *zdict=NULL;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO:decompressobj",
+                                     kwlist, &wbits, &zdict))
         return NULL;
+    if (zdict != NULL && !PyObject_CheckBuffer(zdict)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "zdict argument must support the buffer protocol");
+        return NULL;
+    }
 
     self = newcompobject(&Decomptype);
     if (self == NULL)
@@ -369,6 +411,10 @@
     self->zst.zfree = (free_func)Z_NULL;
     self->zst.next_in = NULL;
     self->zst.avail_in = 0;
+    if (zdict != NULL) {
+        Py_INCREF(zdict);
+        self->zdict = zdict;
+    }
     err = inflateInit2(&self->zst, wbits);
     switch(err) {
     case (Z_OK):
@@ -398,6 +444,7 @@
 #endif
     Py_XDECREF(self->unused_data);
     Py_XDECREF(self->unconsumed_tail);
+    Py_XDECREF(self->zdict);
     PyObject_Del(self);
 }
 
@@ -557,6 +604,27 @@
     err = inflate(&(self->zst), Z_SYNC_FLUSH);
     Py_END_ALLOW_THREADS
 
+    if (err == Z_NEED_DICT && self->zdict != NULL) {
+        Py_buffer zdict_buf;
+        if (PyObject_GetBuffer(self->zdict, &zdict_buf, PyBUF_SIMPLE) == -1) {
+            Py_DECREF(RetVal);
+            RetVal = NULL;
+            goto error;
+        }
+        err = inflateSetDictionary(&(self->zst), zdict_buf.buf, zdict_buf.len);
+        PyBuffer_Release(&zdict_buf);
+        if (err != Z_OK) {
+            zlib_error(self->zst, err, "while decompressing data");
+            Py_DECREF(RetVal);
+            RetVal = NULL;
+            goto error;
+        }
+        /* repeat the call to inflate! */
+        Py_BEGIN_ALLOW_THREADS
+        err = inflate(&(self->zst), Z_SYNC_FLUSH);
+        Py_END_ALLOW_THREADS
+    }
+
     /* While Z_OK and the output buffer is full, there might be more output.
        So extend the output buffer and try again.
     */
@@ -770,10 +838,13 @@
     }
     Py_INCREF(self->unused_data);
     Py_INCREF(self->unconsumed_tail);
+    Py_XINCREF(self->zdict);
     Py_XDECREF(retval->unused_data);
     Py_XDECREF(retval->unconsumed_tail);
+    Py_XDECREF(retval->zdict);
     retval->unused_data = self->unused_data;
     retval->unconsumed_tail = self->unconsumed_tail;
+    retval->zdict = self->zdict;
     retval->eof = self->eof;
 
     /* Mark it as being initialized */
@@ -822,10 +893,13 @@
 
     Py_INCREF(self->unused_data);
     Py_INCREF(self->unconsumed_tail);
+    Py_XINCREF(self->zdict);
     Py_XDECREF(retval->unused_data);
     Py_XDECREF(retval->unconsumed_tail);
+    Py_XDECREF(retval->zdict);
     retval->unused_data = self->unused_data;
     retval->unconsumed_tail = self->unconsumed_tail;
+    retval->zdict = self->zdict;
     retval->eof = self->eof;
 
     /* Mark it as being initialized */
@@ -1032,13 +1106,13 @@
                 adler32__doc__},
     {"compress", (PyCFunction)PyZlib_compress,  METH_VARARGS,
                  compress__doc__},
-    {"compressobj", (PyCFunction)PyZlib_compressobj, METH_VARARGS,
+    {"compressobj", (PyCFunction)PyZlib_compressobj, METH_VARARGS|METH_KEYWORDS,
                     compressobj__doc__},
     {"crc32", (PyCFunction)PyZlib_crc32, METH_VARARGS,
               crc32__doc__},
     {"decompress", (PyCFunction)PyZlib_decompress, METH_VARARGS,
                    decompress__doc__},
-    {"decompressobj", (PyCFunction)PyZlib_decompressobj, METH_VARARGS,
+    {"decompressobj", (PyCFunction)PyZlib_decompressobj, METH_VARARGS|METH_KEYWORDS,
                    decompressobj__doc__},
     {NULL, NULL}
 };
@@ -1112,10 +1186,10 @@
 "\n"
 "adler32(string[, start]) -- Compute an Adler-32 checksum.\n"
 "compress(string[, level]) -- Compress string, with compression level in 1-9.\n"
-"compressobj([level]) -- Return a compressor object.\n"
+"compressobj([level[, ...]]) -- Return a compressor object.\n"
 "crc32(string[, start]) -- Compute a CRC-32 checksum.\n"
 "decompress(string,[wbits],[bufsize]) -- Decompresses a compressed string.\n"
-"decompressobj([wbits]) -- Return a decompressor object.\n"
+"decompressobj([wbits[, zdict]]) -- Return a decompressor object.\n"
 "\n"
 "'wbits' is window buffer size.\n"
 "Compressor objects support compress() and flush() methods; decompressor\n"

-- 
Repository URL: http://hg.python.org/cpython