[Python-checkins] r85384 - in python/branches/py3k: Lib/test/pickletester.py Misc/NEWS Modules/_pickle.c

antoine.pitrou python-checkins at python.org
Tue Oct 12 22:51:21 CEST 2010


Author: antoine.pitrou
Date: Tue Oct 12 22:51:21 2010
New Revision: 85384

Log:
Issue #3873: Speed up unpickling from file objects which have a peek()
method.



Modified:
   python/branches/py3k/Lib/test/pickletester.py
   python/branches/py3k/Misc/NEWS
   python/branches/py3k/Modules/_pickle.c

Modified: python/branches/py3k/Lib/test/pickletester.py
==============================================================================
--- python/branches/py3k/Lib/test/pickletester.py	(original)
+++ python/branches/py3k/Lib/test/pickletester.py	Tue Oct 12 22:51:21 2010
@@ -30,6 +30,21 @@
             n += 1
     return n
 
+
+class UnseekableIO(io.BytesIO):
+    def peek(self, *args):
+        raise NotImplementedError
+
+    def seekable(self):
+        return False
+
+    def seek(self, *args):
+        raise io.UnsupportedOperation
+
+    def tell(self):
+        raise io.UnsupportedOperation
+
+
 # We can't very well test the extension registry without putting known stuff
 # in it, but we have to be careful to restore its original state.  Code
 # should do this:
@@ -1072,9 +1087,10 @@
         # Test the correctness of internal buffering routines when handling
         # large data.
         for proto in protocols:
-            data = (1, b'x' * (256 * 1024))
+            data = (1, min, b'xy' * (30 * 1024), len)
             dumped = self.dumps(data, proto)
             loaded = self.loads(dumped)
+            self.assertEqual(len(loaded), len(data))
             self.assertEqual(loaded, data)
 
 
@@ -1373,6 +1389,31 @@
         f.seek(0)
         self.assertEqual(unpickler.load(), data2)
 
+    def _check_multiple_unpicklings(self, ioclass):
+        for proto in protocols:
+            data1 = [(x, str(x)) for x in range(2000)] + [b"abcde", len]
+            f = ioclass()
+            pickler = self.pickler_class(f, protocol=proto)
+            pickler.dump(data1)
+            pickled = f.getvalue()
+
+            N = 5
+            f = ioclass(pickled * N)
+            unpickler = self.unpickler_class(f)
+            for i in range(N):
+                if f.seekable():
+                    pos = f.tell()
+                self.assertEqual(unpickler.load(), data1)
+                if f.seekable():
+                    self.assertEqual(f.tell(), pos + len(pickled))
+            self.assertRaises(EOFError, unpickler.load)
+
+    def test_multiple_unpicklings_seekable(self):
+        self._check_multiple_unpicklings(io.BytesIO)
+
+    def test_multiple_unpicklings_unseekable(self):
+        self._check_multiple_unpicklings(UnseekableIO)
+
 
 if __name__ == "__main__":
     # Print some stuff that can be used to rewrite DATA{0,1,2}

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Tue Oct 12 22:51:21 2010
@@ -13,6 +13,9 @@
 Library
 -------
 
+- Issue #3873: Speed up unpickling from file objects which have a peek()
+  method.
+
 - Issue #10075: Add a session_stats() method to SSLContext objects.
 
 - Issue #9948: Fixed problem of losing filename case information.

Modified: python/branches/py3k/Modules/_pickle.c
==============================================================================
--- python/branches/py3k/Modules/_pickle.c	(original)
+++ python/branches/py3k/Modules/_pickle.c	Tue Oct 12 22:51:21 2010
@@ -101,6 +101,9 @@
     /* Maximum size of the write buffer of Pickler when pickling to a
        stream.  This is ignored for in-memory pickling. */
     MAX_WRITE_BUF_SIZE = 64 * 1024,
+
+    /* Prefetch size when unpickling (disabled on unpeekable streams) */
+    PREFETCH = 8192 * 16,
 };
 
 /* Exception classes for pickle. These should override the ones defined in
@@ -355,8 +358,10 @@
     char *input_line;
     Py_ssize_t input_len;
     Py_ssize_t next_read_idx;
+    Py_ssize_t prefetched_idx;  /* index of first prefetched byte */
     PyObject *read;             /* read() method of the input stream. */
     PyObject *readline;         /* readline() method of the input stream. */
+    PyObject *peek;             /* peek() method of the input stream, or NULL */
 
     char *encoding;             /* Name of the encoding to be used for
                                    decoding strings pickled using Python
@@ -859,9 +864,28 @@
     self->input_buffer = self->buffer.buf;
     self->input_len = self->buffer.len;
     self->next_read_idx = 0;
+    self->prefetched_idx = self->input_len;
     return self->input_len;
 }
 
+static int
+_Unpickler_SkipConsumed(UnpicklerObject *self)
+{
+    Py_ssize_t consumed = self->next_read_idx - self->prefetched_idx;
+
+    if (consumed > 0) {
+        PyObject *r;
+        assert(self->peek);  /* otherwise we did something wrong */
+        /* This makes a useless copy... */
+        r = PyObject_CallFunction(self->read, "n", consumed);
+        if (r == NULL)
+            return -1;
+        Py_DECREF(r);
+        self->prefetched_idx = self->next_read_idx;
+    }
+    return 0;
+}
+
 static const Py_ssize_t READ_WHOLE_LINE = -1;
 
 /* If reading from a file, we need to only pull the bytes we need, since there
@@ -882,10 +906,12 @@
 _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
 {
     PyObject *data;
-    Py_ssize_t read_size;
+    Py_ssize_t read_size, prefetched_size = 0;
 
     assert(self->read != NULL);
-    assert(self->next_read_idx == 0);
+    
+    if (_Unpickler_SkipConsumed(self) < 0)
+        return -1;
 
     if (n == READ_WHOLE_LINE)
         data = PyObject_Call(self->readline, empty_tuple, NULL);
@@ -895,13 +921,41 @@
             return -1;
         data = _Unpickler_FastCall(self, self->read, len);
     }
-
     if (data == NULL)
         return -1;
 
-    read_size = _Unpickler_SetStringInput(self, data);
-    self->input_len = 0;
+    /* Prefetch some data without advancing the file pointer, if possible */
+    if (self->peek) {
+        PyObject *len, *prefetched;
+        len = PyLong_FromSsize_t(PREFETCH);
+        if (len == NULL) {
+            Py_DECREF(data);
+            return -1;
+        }
+        prefetched = _Unpickler_FastCall(self, self->peek, len);
+        if (prefetched == NULL) {
+            if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) {
+                /* peek() is probably not supported by the given file object */
+                PyErr_Clear();
+                Py_CLEAR(self->peek);
+            }
+            else {
+                Py_DECREF(data);
+                return -1;
+            }
+        }
+        else {
+            assert(PyBytes_Check(prefetched));
+            prefetched_size = PyBytes_GET_SIZE(prefetched);
+            PyBytes_ConcatAndDel(&data, prefetched);
+            if (data == NULL)
+                return -1;
+        }
+    }
+
+    read_size = _Unpickler_SetStringInput(self, data) - prefetched_size;
     Py_DECREF(data);
+    self->prefetched_idx = read_size;
     return read_size;
 }
 
@@ -921,30 +975,31 @@
 static Py_ssize_t
 _Unpickler_Read(UnpicklerObject *self, char **s, Py_ssize_t n)
 {
+    Py_ssize_t num_read;
+
     if (n == 0) {
         *s = NULL;
         return 0;
     }
 
-    /* This condition will always be true if self->read. */
-    if (self->next_read_idx + n > self->input_len) {
-        if (self->read) {
-            Py_ssize_t num_read;
-            assert(self->next_read_idx == self->input_len);
-            num_read = _Unpickler_ReadFromFile(self, n);
-            if (n < 0)
-                 return -1;
-            if (num_read == n) {
-                *s = self->input_buffer;
-                return num_read;
-            }
-        }
+    if (self->next_read_idx + n <= self->input_len) {
+        *s = self->input_buffer + self->next_read_idx;
+        self->next_read_idx += n;
+        return n;
+    }
+    if (!self->read) {
+        PyErr_Format(PyExc_EOFError, "Ran out of input");
+        return -1;
+    }
+    num_read = _Unpickler_ReadFromFile(self, n);
+    if (num_read < 0)
+        return -1;
+    if (num_read < n) {
         PyErr_Format(PyExc_EOFError, "Ran out of input");
         return -1;
     }
-    assert(self->read == NULL);
-    *s = self->input_buffer + self->next_read_idx;
-    self->next_read_idx += n;
+    *s = self->input_buffer;
+    self->next_read_idx = n;
     return n;
 }
 
@@ -972,9 +1027,7 @@
 {
     Py_ssize_t i, num_read;
 
-    /* This loop will never be entered if self->read is not NULL. */
     for (i = self->next_read_idx; i < self->input_len; i++) {
-        assert(self->read == NULL);
         if (self->input_buffer[i] == '\n') {
             char *line_start = self->input_buffer + self->next_read_idx;
             num_read = i - self->next_read_idx + 1;
@@ -983,11 +1036,11 @@
         }
     }
     if (self->read) {
-        assert(self->next_read_idx == self->input_len);
         num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE);
         if (num_read < 0)
             return -1;
         *result = self->input_buffer;
+        self->next_read_idx = num_read;
         return num_read;
     }
  
@@ -1106,8 +1159,10 @@
     self->input_line = NULL;
     self->input_len = 0;
     self->next_read_idx = 0;
+    self->prefetched_idx = 0;
     self->read = NULL;
     self->readline = NULL;
+    self->peek = NULL;
     self->encoding = NULL;
     self->errors = NULL;
     self->marks = NULL;
@@ -1124,6 +1179,13 @@
 static int
 _Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file)
 {
+    self->peek = PyObject_GetAttrString(file, "peek");
+    if (self->peek == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_AttributeError))
+            PyErr_Clear();
+        else
+            return -1;
+    }
     self->read = PyObject_GetAttrString(file, "read");
     self->readline = PyObject_GetAttrString(file, "readline");
     if (self->readline == NULL || self->read == NULL) {
@@ -1132,6 +1194,7 @@
                             "file must have 'read' and 'readline' attributes");
         Py_CLEAR(self->read);
         Py_CLEAR(self->readline);
+        Py_CLEAR(self->peek);
         return -1;
     }
     return 0;
@@ -5207,6 +5270,9 @@
         break;                  /* and we are done! */
     }
 
+    if (_Unpickler_SkipConsumed(self) < 0)
+        return NULL;
+
     /* XXX: It is not clear what this is actually for. */
     if ((err = PyErr_Occurred())) {
         if (err == PyExc_EOFError) {
@@ -5356,6 +5422,7 @@
     PyObject_GC_UnTrack((PyObject *)self);
     Py_XDECREF(self->readline);
     Py_XDECREF(self->read);
+    Py_XDECREF(self->peek);
     Py_XDECREF(self->stack);
     Py_XDECREF(self->pers_func);
     Py_XDECREF(self->arg);
@@ -5378,6 +5445,7 @@
 {
     Py_VISIT(self->readline);
     Py_VISIT(self->read);
+    Py_VISIT(self->peek);
     Py_VISIT(self->stack);
     Py_VISIT(self->pers_func);
     Py_VISIT(self->arg);
@@ -5389,6 +5457,7 @@
 {
     Py_CLEAR(self->readline);
     Py_CLEAR(self->read);
+    Py_CLEAR(self->peek);
     Py_CLEAR(self->stack);
     Py_CLEAR(self->pers_func);
     Py_CLEAR(self->arg);


More information about the Python-checkins mailing list