[Python-Dev] cStringIO vs io.BytesIO

dw+python-dev at hmmz.org dw+python-dev at hmmz.org
Thu Jul 17 01:07:54 CEST 2014


On Thu, Jul 17, 2014 at 03:44:23AM +0600, Mikhail Korobov wrote:

> So making code 3.x compatible by ditching cStringIO can cause a serious
> performance/memory  regressions. One can change the code to build the data
> using BytesIO (without creating bytes objects in the first place), but that is
> not always possible or convenient.
> 
> I believe this problem affects tornado (https://github.com/tornadoweb/tornado/
> Do you know if there a workaround? Maybe there is some stdlib part that I'm
> missing, or a module on PyPI? It is not that hard to write an own wrapper that
> won't do copies (or to port [c]StringIO to 3.x), but I wonder if there is an
> existing solution or plans to fix it in Python itself - this BytesIO use case
> looks quite important.

Regarding a fix, the problem seems mostly that the StringI/StringO
specializations were removed, and the new implementation is basically
just a StringO.

At a small cost to memory, it is easy to add a Py_buffer source and
flags variable to the bytesio struct, with the buffers initially setup
for reading, and if a mutation method is called, check for a
copy-on-write flag, duplicate the source object into private memory,
then continue operating as it does now.

Attached is a (rough) patch implementing this, feel free to try it with
hg tip.

    [23:03:44 k2!124 cpython] cat i.py
    import io
    buf = b'x' * (1048576 * 16)
    def x():
        io.BytesIO(buf)

    [23:03:51 k2!125 cpython] ./python -m timeit  -s 'import i' 'i.x()'
    100 loops, best of 3: 2.9 msec per loop

    [23:03:57 k2!126 cpython] ./python-cow -m timeit  -s 'import i' 'i.x()'
    1000000 loops, best of 3: 0.364 usec per loop


David



diff --git a/Modules/_io/bytesio.c b/Modules/_io/bytesio.c
--- a/Modules/_io/bytesio.c
+++ b/Modules/_io/bytesio.c
@@ -2,6 +2,12 @@
 #include "structmember.h"       /* for offsetof() */
 #include "_iomodule.h"
 
+enum io_flags {
+    /* initvalue describes a borrowed buffer we cannot modify and must later
+     * release */
+    IO_SHARED = 1
+};
+
 typedef struct {
     PyObject_HEAD
     char *buf;
@@ -11,6 +17,10 @@
     PyObject *dict;
     PyObject *weakreflist;
     Py_ssize_t exports;
+    Py_buffer initvalue;
+    /* If IO_SHARED, indicates PyBuffer_release(initvalue) required, and that
+     * we don't own buf. */
+    enum io_flags flags;
 } bytesio;
 
 typedef struct {
@@ -33,6 +43,47 @@
         return NULL; \
     }
 
+/* Unshare our buffer in preparation for writing, in the case that an
+ * initvalue object was provided, and we're currently borrowing its buffer.
+ * size indicates the total reserved buffer size allocated as part of
+ * unsharing, to avoid a potentially redundant allocation in the subsequent
+ * mutation.
+ */
+static int
+unshare(bytesio *self, size_t size)
+{
+    Py_ssize_t new_size = size;
+    Py_ssize_t copy_size = size;
+    char *new_buf;
+
+    /* Do nothing if buffer wasn't shared */
+    if (! (self->flags & IO_SHARED)) {
+        return 0;
+    }
+
+    /* If hint provided, adjust our new buffer size and truncate the amount of
+     * source buffer we copy as necessary. */
+    if (size > copy_size) {
+        copy_size = size;
+    }
+
+    /* Allocate or fail. */
+    new_buf = (char *)PyMem_Malloc(new_size);
+    if (new_buf == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    /* Copy the (possibly now truncated) source string to the new buffer, and
+     * forget any reference used to keep the source buffer alive. */
+    memcpy(new_buf, self->buf, copy_size);
+    PyBuffer_Release(&self->initvalue);
+    self->flags &= ~IO_SHARED;
+    self->buf = new_buf;
+    self->buf_size = new_size;
+    self->string_size = (Py_ssize_t) copy_size;
+    return 0;
+}
 
 /* Internal routine to get a line from the buffer of a BytesIO
    object. Returns the length between the current position to the
@@ -125,11 +176,18 @@
 static Py_ssize_t
 write_bytes(bytesio *self, const char *bytes, Py_ssize_t len)
 {
+    size_t desired;
+
     assert(self->buf != NULL);
     assert(self->pos >= 0);
     assert(len >= 0);
 
-    if ((size_t)self->pos + len > self->buf_size) {
+    desired = (size_t)self->pos + len;
+    if (unshare(self, desired)) {
+        return -1;
+    }
+
+    if (desired > self->buf_size) {
         if (resize_buffer(self, (size_t)self->pos + len) < 0)
             return -1;
     }
@@ -502,6 +560,10 @@
         return NULL;
     }
 
+    if (unshare(self, size)) {
+        return NULL;
+    }
+
     if (size < self->string_size) {
         self->string_size = size;
         if (resize_buffer(self, size) < 0)
@@ -655,10 +717,13 @@
 static PyObject *
 bytesio_close(bytesio *self)
 {
-    if (self->buf != NULL) {
+    if (self->flags & IO_SHARED) {
+        PyBuffer_Release(&self->initvalue);
+        self->flags &= ~IO_SHARED;
+    } else if (self->buf != NULL) {
         PyMem_Free(self->buf);
-        self->buf = NULL;
     }
+    self->buf = NULL;
     Py_RETURN_NONE;
 }
 
@@ -788,10 +853,17 @@
                         "deallocated BytesIO object has exported buffers");
         PyErr_Print();
     }
-    if (self->buf != NULL) {
+
+    if (self->flags & IO_SHARED) {
+        /* We borrowed buf from another object */
+        PyBuffer_Release(&self->initvalue);
+        self->flags &= ~IO_SHARED;
+    } else if (self->buf != NULL) {
+        /* We owned buf */
         PyMem_Free(self->buf);
-        self->buf = NULL;
     }
+    self->buf = NULL;
+
     Py_CLEAR(self->dict);
     if (self->weakreflist != NULL)
         PyObject_ClearWeakRefs((PyObject *) self);
@@ -811,12 +883,6 @@
     /* tp_alloc initializes all the fields to zero. So we don't have to
        initialize them here. */
 
-    self->buf = (char *)PyMem_Malloc(0);
-    if (self->buf == NULL) {
-        Py_DECREF(self);
-        return PyErr_NoMemory();
-    }
-
     return (PyObject *)self;
 }
 
@@ -834,13 +900,32 @@
     self->string_size = 0;
     self->pos = 0;
 
+    /* Release any previous initvalue. */
+    if (self->flags & IO_SHARED) {
+        PyBuffer_Release(&self->initvalue);
+        self->buf = NULL;
+        self->flags &= ~IO_SHARED;
+    }
+
     if (initvalue && initvalue != Py_None) {
-        PyObject *res;
-        res = bytesio_write(self, initvalue);
-        if (res == NULL)
+        Py_buffer *buf = &self->initvalue;
+        if (PyObject_GetBuffer(initvalue, buf, PyBUF_CONTIG_RO) < 0) {
             return -1;
-        Py_DECREF(res);
-        self->pos = 0;
+        }
+        self->buf = self->initvalue.buf;
+        self->buf_size = (size_t)self->initvalue.len;
+        self->string_size = self->initvalue.len;
+        self->flags |= IO_SHARED;
+    }
+
+    /* If no initvalue provided, prepare a private buffer now. */
+    if (self->buf == NULL) {
+        self->buf = (char *)PyMem_Malloc(0);
+        if (self->buf == NULL) {
+            Py_DECREF(self);
+            PyErr_NoMemory();
+            return -1;
+        }
     }
 
     return 0;


More information about the Python-Dev mailing list