[Python-Dev] cStringIO vs io.BytesIO
dw+python-dev at hmmz.org
dw+python-dev at hmmz.org
Thu Jul 17 02:18:21 CEST 2014
It's worth note that a natural extension of this is to do something very
similar on the write side: instead of generating a temporary private
heap allocation, generate (and freely resize) a private PyBytes object
until it is exposed to the user, at which point, _getvalue() returns it,
and converts its into an IO_SHARED buffer.
That way another copy is avoided in the common case of building a
string, calling getvalue() once, then discarding the IO object.
David
On Wed, Jul 16, 2014 at 11:07:54PM +0000, dw+python-dev at hmmz.org wrote:
> On Thu, Jul 17, 2014 at 03:44:23AM +0600, Mikhail Korobov wrote:
>
> > So making code 3.x compatible by ditching cStringIO can cause a serious
> > performance/memory regressions. One can change the code to build the data
> > using BytesIO (without creating bytes objects in the first place), but that is
> > not always possible or convenient.
> >
> > I believe this problem affects tornado (https://github.com/tornadoweb/tornado/
> > Do you know if there a workaround? Maybe there is some stdlib part that I'm
> > missing, or a module on PyPI? It is not that hard to write an own wrapper that
> > won't do copies (or to port [c]StringIO to 3.x), but I wonder if there is an
> > existing solution or plans to fix it in Python itself - this BytesIO use case
> > looks quite important.
>
> Regarding a fix, the problem seems mostly that the StringI/StringO
> specializations were removed, and the new implementation is basically
> just a StringO.
>
> At a small cost to memory, it is easy to add a Py_buffer source and
> flags variable to the bytesio struct, with the buffers initially setup
> for reading, and if a mutation method is called, check for a
> copy-on-write flag, duplicate the source object into private memory,
> then continue operating as it does now.
>
> Attached is a (rough) patch implementing this, feel free to try it with
> hg tip.
>
> [23:03:44 k2!124 cpython] cat i.py
> import io
> buf = b'x' * (1048576 * 16)
> def x():
> io.BytesIO(buf)
>
> [23:03:51 k2!125 cpython] ./python -m timeit -s 'import i' 'i.x()'
> 100 loops, best of 3: 2.9 msec per loop
>
> [23:03:57 k2!126 cpython] ./python-cow -m timeit -s 'import i' 'i.x()'
> 1000000 loops, best of 3: 0.364 usec per loop
>
>
> David
>
>
>
> diff --git a/Modules/_io/bytesio.c b/Modules/_io/bytesio.c
> --- a/Modules/_io/bytesio.c
> +++ b/Modules/_io/bytesio.c
> @@ -2,6 +2,12 @@
> #include "structmember.h" /* for offsetof() */
> #include "_iomodule.h"
>
> +enum io_flags {
> + /* initvalue describes a borrowed buffer we cannot modify and must later
> + * release */
> + IO_SHARED = 1
> +};
> +
> typedef struct {
> PyObject_HEAD
> char *buf;
> @@ -11,6 +17,10 @@
> PyObject *dict;
> PyObject *weakreflist;
> Py_ssize_t exports;
> + Py_buffer initvalue;
> + /* If IO_SHARED, indicates PyBuffer_release(initvalue) required, and that
> + * we don't own buf. */
> + enum io_flags flags;
> } bytesio;
>
> typedef struct {
> @@ -33,6 +43,47 @@
> return NULL; \
> }
>
> +/* Unshare our buffer in preparation for writing, in the case that an
> + * initvalue object was provided, and we're currently borrowing its buffer.
> + * size indicates the total reserved buffer size allocated as part of
> + * unsharing, to avoid a potentially redundant allocation in the subsequent
> + * mutation.
> + */
> +static int
> +unshare(bytesio *self, size_t size)
> +{
> + Py_ssize_t new_size = size;
> + Py_ssize_t copy_size = size;
> + char *new_buf;
> +
> + /* Do nothing if buffer wasn't shared */
> + if (! (self->flags & IO_SHARED)) {
> + return 0;
> + }
> +
> + /* If hint provided, adjust our new buffer size and truncate the amount of
> + * source buffer we copy as necessary. */
> + if (size > copy_size) {
> + copy_size = size;
> + }
> +
> + /* Allocate or fail. */
> + new_buf = (char *)PyMem_Malloc(new_size);
> + if (new_buf == NULL) {
> + PyErr_NoMemory();
> + return -1;
> + }
> +
> + /* Copy the (possibly now truncated) source string to the new buffer, and
> + * forget any reference used to keep the source buffer alive. */
> + memcpy(new_buf, self->buf, copy_size);
> + PyBuffer_Release(&self->initvalue);
> + self->flags &= ~IO_SHARED;
> + self->buf = new_buf;
> + self->buf_size = new_size;
> + self->string_size = (Py_ssize_t) copy_size;
> + return 0;
> +}
>
> /* Internal routine to get a line from the buffer of a BytesIO
> object. Returns the length between the current position to the
> @@ -125,11 +176,18 @@
> static Py_ssize_t
> write_bytes(bytesio *self, const char *bytes, Py_ssize_t len)
> {
> + size_t desired;
> +
> assert(self->buf != NULL);
> assert(self->pos >= 0);
> assert(len >= 0);
>
> - if ((size_t)self->pos + len > self->buf_size) {
> + desired = (size_t)self->pos + len;
> + if (unshare(self, desired)) {
> + return -1;
> + }
> +
> + if (desired > self->buf_size) {
> if (resize_buffer(self, (size_t)self->pos + len) < 0)
> return -1;
> }
> @@ -502,6 +560,10 @@
> return NULL;
> }
>
> + if (unshare(self, size)) {
> + return NULL;
> + }
> +
> if (size < self->string_size) {
> self->string_size = size;
> if (resize_buffer(self, size) < 0)
> @@ -655,10 +717,13 @@
> static PyObject *
> bytesio_close(bytesio *self)
> {
> - if (self->buf != NULL) {
> + if (self->flags & IO_SHARED) {
> + PyBuffer_Release(&self->initvalue);
> + self->flags &= ~IO_SHARED;
> + } else if (self->buf != NULL) {
> PyMem_Free(self->buf);
> - self->buf = NULL;
> }
> + self->buf = NULL;
> Py_RETURN_NONE;
> }
>
> @@ -788,10 +853,17 @@
> "deallocated BytesIO object has exported buffers");
> PyErr_Print();
> }
> - if (self->buf != NULL) {
> +
> + if (self->flags & IO_SHARED) {
> + /* We borrowed buf from another object */
> + PyBuffer_Release(&self->initvalue);
> + self->flags &= ~IO_SHARED;
> + } else if (self->buf != NULL) {
> + /* We owned buf */
> PyMem_Free(self->buf);
> - self->buf = NULL;
> }
> + self->buf = NULL;
> +
> Py_CLEAR(self->dict);
> if (self->weakreflist != NULL)
> PyObject_ClearWeakRefs((PyObject *) self);
> @@ -811,12 +883,6 @@
> /* tp_alloc initializes all the fields to zero. So we don't have to
> initialize them here. */
>
> - self->buf = (char *)PyMem_Malloc(0);
> - if (self->buf == NULL) {
> - Py_DECREF(self);
> - return PyErr_NoMemory();
> - }
> -
> return (PyObject *)self;
> }
>
> @@ -834,13 +900,32 @@
> self->string_size = 0;
> self->pos = 0;
>
> + /* Release any previous initvalue. */
> + if (self->flags & IO_SHARED) {
> + PyBuffer_Release(&self->initvalue);
> + self->buf = NULL;
> + self->flags &= ~IO_SHARED;
> + }
> +
> if (initvalue && initvalue != Py_None) {
> - PyObject *res;
> - res = bytesio_write(self, initvalue);
> - if (res == NULL)
> + Py_buffer *buf = &self->initvalue;
> + if (PyObject_GetBuffer(initvalue, buf, PyBUF_CONTIG_RO) < 0) {
> return -1;
> - Py_DECREF(res);
> - self->pos = 0;
> + }
> + self->buf = self->initvalue.buf;
> + self->buf_size = (size_t)self->initvalue.len;
> + self->string_size = self->initvalue.len;
> + self->flags |= IO_SHARED;
> + }
> +
> + /* If no initvalue provided, prepare a private buffer now. */
> + if (self->buf == NULL) {
> + self->buf = (char *)PyMem_Malloc(0);
> + if (self->buf == NULL) {
> + Py_DECREF(self);
> + PyErr_NoMemory();
> + return -1;
> + }
> }
>
> return 0;
> _______________________________________________
> Python-Dev mailing list
> Python-Dev at python.org
> https://mail.python.org/mailman/listinfo/python-dev
> Unsubscribe: https://mail.python.org/mailman/options/python-dev/dw%2Bpython-dev%40hmmz.org
More information about the Python-Dev
mailing list