Support for new items in set type
Prateek
surekap at gmail.com
Mon Apr 23 01:17:49 EDT 2007
Oh dear god, I implemented this and it overall killed performance by
about 50% - 100%. The same script (entering 3000 items) takes between
88 - 109s (it was running in 55s earlier).
Here is the new Set implementation:
class SeaSet(set):
__slots__ = ['master', 'added', 'deleted']
def __init__(self, s=None):
if s is None: s = []
self.master = set(s)
self.added = set()
self.deleted = set()
def add(self, l):
if l not in self.master:
self.added.add(l)
try:
self.deleted.remove(l)
except KeyError:
pass
def remove(self, l):
try:
self.master.remove(l)
self.deleted.add(l)
except KeyError:
try:
self.added.remove(l)
except KeyError:
pass
def __contains__(self, l):
if l in self.deleted:
return False
elif l in self.added:
return True
else:
return l in self.master
def __len__(self):
return len(self.master) + len(self.added)
def __iter__(self):
for x in self.master:
yield x
for x in self.added:
yield x
def __or__(self, s):
return self.union(s)
def __and__(self, s):
return self.intersection(s)
def __sub__(self, s):
return self.difference(s)
def union(self, s):
"""docstring for union"""
if isinstance(s, (set, frozenset)):
return s | self.master | self.added
elif isinstance(s, SeaSet):
return self.master | self.added | s.master | s.added
else:
raise TypeError
def intersection(self, s):
"""docstring for intersection"""
if isinstance(s, (set, frozenset)):
return s & self.master & self.added
elif isinstance(s, SeaSet):
return self.master & self.added & s.master & s.added
else:
raise TypeError
def difference(self, s):
"""docstring for difference"""
if isinstance(s, (set, frozenset)):
self.deleted |= (self.master - s)
self.master -= s
self.added -= s
elif isinstance(s, SeaSet):
t = (s.master | s.deleted)
self.deleted |= self.master - t
self.master -= t
self.added -= t
else:
raise TypeError
The surprising thing is that commits *ARE* running about 50% faster
(according to the time column in the hotshot profiler). But, now, the
longest running operations seem to be the I/O operations which are
taking 10 times longer! (even if they're only reading or writing a few
bytes. Could this have something to do with the set implementation
being in Python as opposed to C?
For instance, this method:
def __readTableHeader(self, f):
hdr = f.read(sz__TABLE_HEADER_FORMAT__)
if len(hdr) < sz__TABLE_HEADER_FORMAT__:
raise EOFError
t = THF_U(hdr)
#t = unpack(__TABLE_HEADER_FORMAT__, hdr)
return t
is now taking > 13s when it was taking less than 0.8s before! (same
number of calls, nothing changed except the set implementation)
sz__TABLE_HEADER_FORMAT__ is a constant = struct.calcsize("<II37s") =
45
THF_U is a reference to the struct.unpack function for the relevant
Struct object
What the heck happenned?!
Prateek
More information about the Python-list
mailing list