[Python-Dev] Unicode charmap decoders slow

jepler@unpythonic.net jepler at unpythonic.net
Wed Oct 5 14:54:05 CEST 2005


The function in the module below, xlate.xlate, doesn't quite do what "".decode
does.  (Mostly, characters that don't exist are always mapped to u+fffd,
instead of having the various behaviors available to "".decode.)

It builds the fast decoding structure once per call, but when decoding 53kb of
data that overhead is small enough to make it much faster than
s.decode('mac-roman').  For smaller buffers (I tried 53 characters), s.decode is
two times faster (43us for xlate vs 21us for s.decode).

$ timeit.py -s "s='a'*53*1024; import xlate" "s.decode('mac-roman')"
100 loops, best of 3: 12.8 msec per loop
$ timeit.py -s "s='a'*53*1024; import xlate, encodings.mac_roman" \
	"xlate.xlate(s, encodings.mac_roman.decoding_map)"
1000 loops, best of 3: 573 usec per loop

Jeff
-------------- next part --------------
#include <Python.h>
#include <stringobject.h>
#include <dictobject.h>

/*
 * xlate(string, map) -> unicode
 *
 * Decode a byte string through a charmap given as a dictionary.
 *
 * Arguments (packed in the tuple `o`):
 *   string - the bytes to decode ("s#" format)
 *   map    - dict mapping int keys in 0..255 to int code points or None
 *
 * A 256-entry lookup table is built from `map` on every call; bytes whose
 * key is absent or maps to None decode to U+FFFD.  The result is a new
 * unicode object with one character per input byte.
 *
 * Returns NULL with an exception set on failure: TypeError for a non-dict
 * map, non-int or out-of-range keys, and non-int/non-None values;
 * MemoryError if the lookup table cannot be allocated.
 *
 * NOTE(review): `pos` and `length` are plain ints as in the original; newer
 * Python 2 headers may expect Py_ssize_t for PyDict_Next - confirm against
 * the targeted Python version.
 */
PyObject *xlate(PyObject *s, PyObject *o) {
    char *inbuf;
    int i, length, pos=0;
    PyObject *map, *key, *value, *ret = NULL;
    Py_UNICODE *u, *ru;

    if(!PyArg_ParseTuple(o, "s#O", &inbuf, &length, &map)) return NULL;
    if(!PyDict_Check(map)) {
        PyErr_SetString(PyExc_TypeError, "Argument 2 must be a dictionary");
        return NULL;
    }

    u = PyMem_Malloc(sizeof(Py_UNICODE) * 256);
    /* PyMem_Malloc does not set an exception itself. */
    if(!u) { return PyErr_NoMemory(); }

    /* Every byte decodes to the replacement character unless mapped. */
    for(i=0; i<256; i++) {
        u[i] = 0xfffd;
    }

    while(PyDict_Next(map, &pos, &key, &value)) {
        /* Keep the full long so that range checks are not defeated by
         * truncation to int. */
        long ki, vi;
        if(!PyInt_Check(key)) {
            PyErr_SetString(PyExc_TypeError, "Dictionary keys must be ints");
            goto done;
        }
        ki = PyInt_AsLong(key);
        if(ki < 0 || ki > 255) {
            PyErr_Format(PyExc_TypeError,
                "Dictionary keys must be in the range 0..255 (saw %ld)", ki);
            goto done;
        }
        if(value == Py_None) continue;
        if(!PyInt_Check(value)) {
            PyErr_SetString(PyExc_TypeError, "Dictionary values must be ints or None");
            goto done;
        }
        vi = PyInt_AsLong(value);
        /* The value is stored without a range check, as in the original. */
        u[ki] = (Py_UNICODE)vi;
    }

    ret = PyUnicode_FromUnicode(NULL, length);
    if(!ret) goto done;
    ru = PyUnicode_AsUnicode(ret);
    for(i=0; i<length; i++) {
        /* Index as unsigned so bytes >= 0x80 do not become negative. */
        ru[i] = u[(unsigned char)inbuf[i]];
    }

done:
    /* Single exit: the table is released on success and on every error
     * path, with the allocator family that created it. */
    PyMem_Free(u);
    return ret;
}

/* Method table for the module: exposes the C function xlate under the
 * Python name "xlate", called with a tuple of positional arguments
 * (METH_VARARGS) and with no docstring.  The all-NULL entry ends the table. */
PyMethodDef md[] = {
    {"xlate", (PyCFunction)xlate, METH_VARARGS, NULL},
    {NULL, NULL, 0, NULL}
};

void initxlate(void) {
    Py_InitModule("xlate", md);
}
-------------- next part --------------
import encodings.mac_roman
import xlate

def test(encname, decoding_map):

    s = ""
    for k, v in decoding_map.items():
        if v is not None: 
            s += chr(k)

    u1 = s.decode(encname)
    print decoding_map
    u2 = xlate.xlate(s, decoding_map)
    assert u1 == u2

# Run the comparison for the mac-roman codec using its stock decoding map.
test("mac-roman", encodings.mac_roman.decoding_map)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
Url : http://mail.python.org/pipermail/python-dev/attachments/20051005/ebe46a0e/attachment.pgp


More information about the Python-Dev mailing list