[Python-checkins] cpython: Issue #13149: Speed up append-only StringIO objects.

antoine.pitrou python-checkins at python.org
Thu Nov 10 21:56:43 CET 2011


http://hg.python.org/cpython/rev/8d9a869db675
changeset:   73487:8d9a869db675
user:        Antoine Pitrou <solipsis at pitrou.net>
date:        Thu Nov 10 21:47:38 2011 +0100
summary:
  Issue #13149: Speed up append-only StringIO objects.
This is very similar to the "lazy strings" idea.

files:
  Misc/NEWS               |    2 +
  Modules/_io/stringio.c  |  109 ++++++++++++++++++++++++++-
  Objects/unicodeobject.c |    2 +-
  3 files changed, 108 insertions(+), 5 deletions(-)


diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -365,6 +365,8 @@
 Library
 -------
 
+- Issue #13149: Speed up append-only StringIO objects.
+
 - Issue #13373: multiprocessing.Queue.get() could sometimes block indefinitely
   when called with a timeout.  Patch by Arnaud Ysmal.
 
diff --git a/Modules/_io/stringio.c b/Modules/_io/stringio.c
--- a/Modules/_io/stringio.c
+++ b/Modules/_io/stringio.c
@@ -7,6 +7,9 @@
    than the enclosed string, for proper functioning of _PyIO_find_line_ending.
 */
 
+#define STATE_REALIZED 1
+#define STATE_ACCUMULATING 2
+
 typedef struct {
     PyObject_HEAD
     Py_UCS4 *buf;
@@ -14,6 +17,15 @@
     Py_ssize_t string_size;
     size_t buf_size;
 
+    /* The stringio object can be in two states: accumulating or realized.
+       In accumulating state, the internal buffer contains nothing and
+       the contents are given by the embedded _PyAccu structure.
+       In realized state, the internal buffer is meaningful and the
+       _PyAccu is destroyed.
+    */
+    int state;
+    _PyAccu accu;
+
     char ok; /* initialized? */
     char closed;
     char readuniversal;
@@ -40,6 +52,11 @@
         return NULL; \
     }
 
+#define ENSURE_REALIZED(self) \
+    if (realize(self) < 0) { /* put self in realized state before self->buf is used */ \
+        return NULL; /* so only usable inside functions that return a pointer */ \
+    }
+
 PyDoc_STRVAR(stringio_doc,
     "Text I/O implementation using an in-memory buffer.\n"
     "\n"
@@ -102,6 +119,54 @@
     return -1;
 }
 
+static PyObject *
+make_intermediate(stringio *self)
+{   /* Return the accumulated contents as one object while staying in accumulating state; NULL on error. */
+    PyObject *intermediate = _PyAccu_Finish(&self->accu); /* presumably joins the pieces and releases the accu -- confirm in accu.h */
+    self->state = STATE_REALIZED; /* provisional: this is the state left behind by every failure path below */
+    if (intermediate == NULL)
+        return NULL;
+    if (_PyAccu_Init(&self->accu) || /* start a fresh accumulator ... */
+        _PyAccu_Accumulate(&self->accu, intermediate)) { /* ... seeded with the contents obtained above */
+        Py_DECREF(intermediate);
+        return NULL;
+    }
+    self->state = STATE_ACCUMULATING; /* accu is usable again, so later appends can keep accumulating */
+    return intermediate; /* callers (getvalue, read) return this object directly as their result */
+}
+
+static int
+realize(stringio *self)
+{   /* Switch self from accumulating to realized state. Returns 0 on success (or if already realized), -1 on error. */
+    Py_ssize_t len;
+    PyObject *intermediate;
+
+    if (self->state == STATE_REALIZED)
+        return 0; /* nothing to do: the internal buffer is already meaningful */
+    assert(self->state == STATE_ACCUMULATING);
+    self->state = STATE_REALIZED; /* set before any fallible call, so the state is REALIZED even when -1 is returned */
+
+    intermediate = _PyAccu_Finish(&self->accu); /* accumulated contents as a single object */
+    if (intermediate == NULL)
+        return -1;
+
+    /* Copy the intermediate string into the internal UCS4 buffer (self->buf).
+       The length should be equal to the current cursor position.
+     */
+    len = PyUnicode_GET_LENGTH(intermediate);
+    if (resize_buffer(self, len) < 0) { /* make room for len code points before copying */
+        Py_DECREF(intermediate);
+        return -1;
+    }
+    if (!PyUnicode_AsUCS4(intermediate, self->buf, len, 0)) { /* copy_null == 0; len may be 0, which the unicodeobject.c hunk of this patch allows */
+        Py_DECREF(intermediate);
+        return -1;
+    }
+
+    Py_DECREF(intermediate); /* contents now live in self->buf; drop the temporary */
+    return 0;
+}
+
 /* Internal routine for writing a whole PyUnicode object to the buffer of a
    StringIO object. Returns 0 on success, or -1 on error. */
 static Py_ssize_t
@@ -136,7 +201,6 @@
         return -1;
     }
     len = PyUnicode_GET_LENGTH(decoded);
-
     assert(len >= 0);
 
     /* This overflow check is not strictly necessary. However, it avoids us to
@@ -147,6 +211,17 @@
                         "new position too large");
         goto fail;
     }
+
+    if (self->state == STATE_ACCUMULATING) {
+        if (self->string_size == self->pos) {
+            if (_PyAccu_Accumulate(&self->accu, decoded))
+                goto fail;
+            goto success;
+        }
+        if (realize(self))
+            goto fail;
+    }
+
     if (self->pos + len > self->string_size) {
         if (resize_buffer(self, self->pos + len) < 0)
             goto fail;
@@ -174,6 +249,7 @@
                           0))
         goto fail;
 
+success:
     /* Set the new length of the internal string if it has changed. */
     self->pos += len;
     if (self->string_size < self->pos)
@@ -195,6 +271,8 @@
 {
     CHECK_INITIALIZED(self);
     CHECK_CLOSED(self);
+    if (self->state == STATE_ACCUMULATING)
+        return make_intermediate(self);
     return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, self->buf,
                                      self->string_size);
 }
@@ -251,6 +329,14 @@
             size = 0;
     }
 
+    /* Optimization for seek(0); read() */
+    if (self->state == STATE_ACCUMULATING && self->pos == 0 && size == n) {
+        PyObject *result = make_intermediate(self);
+        self->pos = self->string_size;
+        return result;
+    }
+
+    ENSURE_REALIZED(self);
     output = self->buf + self->pos;
     self->pos += size;
     return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, size);
@@ -301,6 +387,7 @@
     if (!PyArg_ParseTuple(args, "|O:readline", &arg))
         return NULL;
     CHECK_CLOSED(self);
+    ENSURE_REALIZED(self);
 
     if (PyNumber_Check(arg)) {
         limit = PyNumber_AsSsize_t(arg, PyExc_OverflowError);
@@ -322,6 +409,7 @@
 
     CHECK_INITIALIZED(self);
     CHECK_CLOSED(self);
+    ENSURE_REALIZED(self);
 
     if (Py_TYPE(self) == &PyStringIO_Type) {
         /* Skip method call overhead for speed */
@@ -392,6 +480,7 @@
     }
 
     if (size < self->string_size) {
+        ENSURE_REALIZED(self);
         if (resize_buffer(self, size) < 0)
             return NULL;
         self->string_size = size;
@@ -492,6 +581,7 @@
     /* Free up some memory */
     if (resize_buffer(self, 0) < 0)
         return NULL;
+    _PyAccu_Destroy(&self->accu);
     Py_CLEAR(self->readnl);
     Py_CLEAR(self->writenl);
     Py_CLEAR(self->decoder);
@@ -521,6 +611,7 @@
         PyMem_Free(self->buf);
         self->buf = NULL;
     }
+    _PyAccu_Destroy(&self->accu);
     Py_CLEAR(self->readnl);
     Py_CLEAR(self->writenl);
     Py_CLEAR(self->decoder);
@@ -559,6 +650,7 @@
     PyObject *value = NULL;
     PyObject *newline_obj = NULL;
     char *newline = "\n";
+    Py_ssize_t value_len;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OO:__init__", kwlist,
                                      &value, &newline_obj))
@@ -600,6 +692,7 @@
 
     self->ok = 0;
 
+    _PyAccu_Destroy(&self->accu);
     Py_CLEAR(self->readnl);
     Py_CLEAR(self->writenl);
     Py_CLEAR(self->decoder);
@@ -636,19 +729,27 @@
     /* Now everything is set up, resize buffer to size of initial value,
        and copy it */
     self->string_size = 0;
-    if (value && value != Py_None) {
-        Py_ssize_t len = PyUnicode_GetSize(value);
+    if (value && value != Py_None)
+        value_len = PyUnicode_GetSize(value);
+    else
+        value_len = 0;
+    if (value_len > 0) {
         /* This is a heuristic, for newline translation might change
            the string length. */
-        if (resize_buffer(self, len) < 0)
+        if (resize_buffer(self, 0) < 0)
             return -1;
+        self->state = STATE_REALIZED;
         self->pos = 0;
         if (write_str(self, value) < 0)
             return -1;
     }
     else {
+        /* Empty stringio object, we can start by accumulating */
         if (resize_buffer(self, 0) < 0)
             return -1;
+        if (_PyAccu_Init(&self->accu))
+            return -1;
+        self->state = STATE_ACCUMULATING;
     }
     self->pos = 0;
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2055,7 +2055,7 @@
 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
                  int copy_null)
 {
-    if (target == NULL || targetsize < 1) {
+    if (target == NULL || targetsize < 0) {
         PyErr_BadInternalCall();
         return NULL;
     }

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list