[Python-Dev] cStringIO vs io.BytesIO

dw+python-dev at hmmz.org dw+python-dev at hmmz.org
Thu Jul 17 02:18:21 CEST 2014


It's worth note that a natural extension of this is to do something very
similar on the write side: instead of generating a temporary private
heap allocation, generate (and freely resize) a private PyBytes object
until it is exposed to the user, at which point, _getvalue() returns it,
and converts its into an IO_SHARED buffer.

That way another copy is avoided in the common case of building a
string, calling getvalue() once, then discarding the IO object.


David

On Wed, Jul 16, 2014 at 11:07:54PM +0000, dw+python-dev at hmmz.org wrote:
> On Thu, Jul 17, 2014 at 03:44:23AM +0600, Mikhail Korobov wrote:
> 
> > So making code 3.x compatible by ditching cStringIO can cause a serious
> > performance/memory  regressions. One can change the code to build the data
> > using BytesIO (without creating bytes objects in the first place), but that is
> > not always possible or convenient.
> > 
> > I believe this problem affects tornado (https://github.com/tornadoweb/tornado/
> > Do you know if there a workaround? Maybe there is some stdlib part that I'm
> > missing, or a module on PyPI? It is not that hard to write an own wrapper that
> > won't do copies (or to port [c]StringIO to 3.x), but I wonder if there is an
> > existing solution or plans to fix it in Python itself - this BytesIO use case
> > looks quite important.
> 
> Regarding a fix, the problem seems mostly that the StringI/StringO
> specializations were removed, and the new implementation is basically
> just a StringO.
> 
> At a small cost to memory, it is easy to add a Py_buffer source and
> flags variable to the bytesio struct, with the buffers initially setup
> for reading, and if a mutation method is called, check for a
> copy-on-write flag, duplicate the source object into private memory,
> then continue operating as it does now.
> 
> Attached is a (rough) patch implementing this, feel free to try it with
> hg tip.
> 
>     [23:03:44 k2!124 cpython] cat i.py
>     import io
>     buf = b'x' * (1048576 * 16)
>     def x():
>         io.BytesIO(buf)
> 
>     [23:03:51 k2!125 cpython] ./python -m timeit  -s 'import i' 'i.x()'
>     100 loops, best of 3: 2.9 msec per loop
> 
>     [23:03:57 k2!126 cpython] ./python-cow -m timeit  -s 'import i' 'i.x()'
>     1000000 loops, best of 3: 0.364 usec per loop
> 
> 
> David
> 
> 
> 
> diff --git a/Modules/_io/bytesio.c b/Modules/_io/bytesio.c
> --- a/Modules/_io/bytesio.c
> +++ b/Modules/_io/bytesio.c
> @@ -2,6 +2,12 @@
>  #include "structmember.h"       /* for offsetof() */
>  #include "_iomodule.h"
>  
> +enum io_flags {
> +    /* initvalue describes a borrowed buffer we cannot modify and must later
> +     * release */
> +    IO_SHARED = 1
> +};
> +
>  typedef struct {
>      PyObject_HEAD
>      char *buf;
> @@ -11,6 +17,10 @@
>      PyObject *dict;
>      PyObject *weakreflist;
>      Py_ssize_t exports;
> +    Py_buffer initvalue;
> +    /* If IO_SHARED, indicates PyBuffer_release(initvalue) required, and that
> +     * we don't own buf. */
> +    enum io_flags flags;
>  } bytesio;
>  
>  typedef struct {
> @@ -33,6 +43,47 @@
>          return NULL; \
>      }
>  
> +/* Unshare our buffer in preparation for writing, in the case that an
> + * initvalue object was provided, and we're currently borrowing its buffer.
> + * size indicates the total reserved buffer size allocated as part of
> + * unsharing, to avoid a potentially redundant allocation in the subsequent
> + * mutation.
> + */
> +static int
> +unshare(bytesio *self, size_t size)
> +{
> +    Py_ssize_t new_size = size;
> +    Py_ssize_t copy_size = size;
> +    char *new_buf;
> +
> +    /* Do nothing if buffer wasn't shared */
> +    if (! (self->flags & IO_SHARED)) {
> +        return 0;
> +    }
> +
> +    /* If hint provided, adjust our new buffer size and truncate the amount of
> +     * source buffer we copy as necessary. */
> +    if (size > copy_size) {
> +        copy_size = size;
> +    }
> +
> +    /* Allocate or fail. */
> +    new_buf = (char *)PyMem_Malloc(new_size);
> +    if (new_buf == NULL) {
> +        PyErr_NoMemory();
> +        return -1;
> +    }
> +
> +    /* Copy the (possibly now truncated) source string to the new buffer, and
> +     * forget any reference used to keep the source buffer alive. */
> +    memcpy(new_buf, self->buf, copy_size);
> +    PyBuffer_Release(&self->initvalue);
> +    self->flags &= ~IO_SHARED;
> +    self->buf = new_buf;
> +    self->buf_size = new_size;
> +    self->string_size = (Py_ssize_t) copy_size;
> +    return 0;
> +}
>  
>  /* Internal routine to get a line from the buffer of a BytesIO
>     object. Returns the length between the current position to the
> @@ -125,11 +176,18 @@
>  static Py_ssize_t
>  write_bytes(bytesio *self, const char *bytes, Py_ssize_t len)
>  {
> +    size_t desired;
> +
>      assert(self->buf != NULL);
>      assert(self->pos >= 0);
>      assert(len >= 0);
>  
> -    if ((size_t)self->pos + len > self->buf_size) {
> +    desired = (size_t)self->pos + len;
> +    if (unshare(self, desired)) {
> +        return -1;
> +    }
> +
> +    if (desired > self->buf_size) {
>          if (resize_buffer(self, (size_t)self->pos + len) < 0)
>              return -1;
>      }
> @@ -502,6 +560,10 @@
>          return NULL;
>      }
>  
> +    if (unshare(self, size)) {
> +        return NULL;
> +    }
> +
>      if (size < self->string_size) {
>          self->string_size = size;
>          if (resize_buffer(self, size) < 0)
> @@ -655,10 +717,13 @@
>  static PyObject *
>  bytesio_close(bytesio *self)
>  {
> -    if (self->buf != NULL) {
> +    if (self->flags & IO_SHARED) {
> +        PyBuffer_Release(&self->initvalue);
> +        self->flags &= ~IO_SHARED;
> +    } else if (self->buf != NULL) {
>          PyMem_Free(self->buf);
> -        self->buf = NULL;
>      }
> +    self->buf = NULL;
>      Py_RETURN_NONE;
>  }
>  
> @@ -788,10 +853,17 @@
>                          "deallocated BytesIO object has exported buffers");
>          PyErr_Print();
>      }
> -    if (self->buf != NULL) {
> +
> +    if (self->flags & IO_SHARED) {
> +        /* We borrowed buf from another object */
> +        PyBuffer_Release(&self->initvalue);
> +        self->flags &= ~IO_SHARED;
> +    } else if (self->buf != NULL) {
> +        /* We owned buf */
>          PyMem_Free(self->buf);
> -        self->buf = NULL;
>      }
> +    self->buf = NULL;
> +
>      Py_CLEAR(self->dict);
>      if (self->weakreflist != NULL)
>          PyObject_ClearWeakRefs((PyObject *) self);
> @@ -811,12 +883,6 @@
>      /* tp_alloc initializes all the fields to zero. So we don't have to
>         initialize them here. */
>  
> -    self->buf = (char *)PyMem_Malloc(0);
> -    if (self->buf == NULL) {
> -        Py_DECREF(self);
> -        return PyErr_NoMemory();
> -    }
> -
>      return (PyObject *)self;
>  }
>  
> @@ -834,13 +900,32 @@
>      self->string_size = 0;
>      self->pos = 0;
>  
> +    /* Release any previous initvalue. */
> +    if (self->flags & IO_SHARED) {
> +        PyBuffer_Release(&self->initvalue);
> +        self->buf = NULL;
> +        self->flags &= ~IO_SHARED;
> +    }
> +
>      if (initvalue && initvalue != Py_None) {
> -        PyObject *res;
> -        res = bytesio_write(self, initvalue);
> -        if (res == NULL)
> +        Py_buffer *buf = &self->initvalue;
> +        if (PyObject_GetBuffer(initvalue, buf, PyBUF_CONTIG_RO) < 0) {
>              return -1;
> -        Py_DECREF(res);
> -        self->pos = 0;
> +        }
> +        self->buf = self->initvalue.buf;
> +        self->buf_size = (size_t)self->initvalue.len;
> +        self->string_size = self->initvalue.len;
> +        self->flags |= IO_SHARED;
> +    }
> +
> +    /* If no initvalue provided, prepare a private buffer now. */
> +    if (self->buf == NULL) {
> +        self->buf = (char *)PyMem_Malloc(0);
> +        if (self->buf == NULL) {
> +            Py_DECREF(self);
> +            PyErr_NoMemory();
> +            return -1;
> +        }
>      }
>  
>      return 0;
> _______________________________________________
> Python-Dev mailing list
> Python-Dev at python.org
> https://mail.python.org/mailman/listinfo/python-dev
> Unsubscribe: https://mail.python.org/mailman/options/python-dev/dw%2Bpython-dev%40hmmz.org


More information about the Python-Dev mailing list