import pickle class C(object): ... def __init__(self, x): ... self.long_attribute_name = x ... len(pickle.dumps([pickle.loads(pickle.dumps(C(None),
len(pickle.dumps([C(None) for i in range(100)],
Instance attribute names are normally interned - this is done in PyObject_SetAttr (among other places). Unpickling (in pickle and cPickle) directly updates __dict__ on the instance object. This bypasses the interning so you end up with many copies of the strings representing your attribute names, which wastes a lot of space, both in RAM and in pickles of sequences of objects created from pickles. Note that the native python memcached client uses pickle to serialize objects. pickle.HIGHEST_PROTOCOL)) for i in range(100)], pickle.HIGHEST_PROTOCOL)) 3658 pickle.HIGHEST_PROTOCOL)) 1441
Interning the strings on unpickling makes the pickles smaller, and at least for cPickle actually makes unpickling sequences of many objects slightly faster. I have included proposed patches to cPickle.c and pickle.py, and would appreciate any feedback. dhcp-172-31-170-32:~ mcguire$ diff -u Downloads/Python-2.4.3/Modules/ cPickle.c cPickle.c --- Downloads/Python-2.4.3/Modules/cPickle.c 2004-07-26 22:22:33.000000000 -0700 +++ cPickle.c 2009-01-26 23:30:31.000000000 -0800 @@ -4258,6 +4258,8 @@ PyObject *state, *inst, *slotstate; PyObject *__setstate__; PyObject *d_key, *d_value; + PyObject *name; + char * key_str; int i; int res = -1; @@ -4319,8 +4321,24 @@ i = 0; while (PyDict_Next(state, &i, &d_key, &d_value)) { - if (PyObject_SetItem(dict, d_key, d_value) < 0) - goto finally; + /* normally the keys for instance attributes are + interned. we should try to do that here. */ + if (PyString_CheckExact(d_key)) { + key_str = PyString_AsString(d_key); + name = PyString_FromString(key_str); + if (! name) + goto finally; + + PyString_InternInPlace(&name); + if (PyObject_SetItem(dict, name, d_value) < 0) { + Py_DECREF(name); + goto finally; + } + Py_DECREF(name); + } else { + if (PyObject_SetItem(dict, d_key, d_value) < 0) + goto finally; + } } Py_DECREF(dict); } dhcp-172-31-170-32:~ mcguire$ diff -u Downloads/Python-2.4.3/Lib/ pickle.py pickle.py --- Downloads/Python-2.4.3/Lib/pickle.py 2009-01-27 01:41:43.000000000 -0800 +++ pickle.py 2009-01-27 01:41:31.000000000 -0800 @@ -1241,7 +1241,15 @@ state, slotstate = state if state: try: - inst.__dict__.update(state) + d = inst.__dict__ + try: + for k,v in state.items(): + d[intern(k)] = v + # keys in state don't have to be strings + # don't blow up, but don't go out of our way + except TypeError: + d.update(state) + except RuntimeError: # XXX In restricted execution, the instance's __dict__ # is not accessible. Use the old way of unpickling