
Christian Tismer wrote: ... (my timings) Attached is the updated script with the timings mentioned in the last posting. Sorry, I posted an older version before. -- Christian Tismer :^) <mailto:tismer@tismer.com> Mission Impossible 5oftware : Have a break! Take a ride on Python's Kaunstr. 26 : *Starship* http://starship.python.net 14163 Berlin : PGP key -> http://wwwkeys.pgp.net PGP Fingerprint E182 71C7 1A9D 66E9 9D15 D3CC D4D7 93E2 1FAE F6DF where do you want to jump today? http://www.stackless.com ## dictest.py ## Test of a new rehash algorithm ## Chris Tismer ## 2000-12-17 ## Mission Impossible 5oftware Team # The following is a partial re-implementation of # Python dictionaries in Python. # The original algorithm was literally turned # into Python code. ##/* ##Table of irreducible polynomials to efficiently cycle through ##GF(2^n)-{0}, 2<=n<=30. ##*/ polys = [ 4 + 3, 8 + 3, 16 + 3, 32 + 5, 64 + 3, 128 + 3, 256 + 29, 512 + 17, 1024 + 9, 2048 + 5, 4096 + 83, 8192 + 27, 16384 + 43, 32768 + 3, 65536 + 45, 131072 + 9, 262144 + 39, 524288 + 39, 1048576 + 9, 2097152 + 5, 4194304 + 3, 8388608 + 33, 16777216 + 27, 33554432 + 9, 67108864 + 71, 134217728 + 39, 268435456 + 9, 536870912 + 5, 1073741824 + 83, 0 ] class NULL: pass class Dictionary: dummy = "<dummy key>" def __init__(mp, newalg=0): mp.ma_size = 0 mp.ma_poly = 0 mp.ma_table = [] mp.ma_fill = 0 mp.ma_used = 0 mp.oldalg = not newalg def lookdict(mp, key, _hash): me_hash, me_key, me_value = range(3) # rec slots dummy = mp.dummy mask = mp.ma_size-1 ep0 = mp.ma_table i = (~_hash) & mask ep = ep0[i] if ep[me_key] is NULL or ep[me_key] == key: return ep if ep[me_key] == dummy: freeslot = ep else: if (ep[me_hash] == _hash and cmp(ep[me_key], key) == 0) : return ep freeslot = NULL ###### FROM HERE if mp.oldalg: incr = (_hash ^ (_hash >> 3)) & mask else: # note that we do not mask! # even the shifting my not be worth it. incr = _hash ^ (_hash >> 3) ###### TO HERE if (not incr): incr = mask while 1: ep = ep0[(i+incr)&mask] if (ep[me_key] is NULL) : if (freeslot != NULL) : return freeslot else: return ep if (ep[me_key] == dummy) : if (freeslot == NULL): freeslot = ep elif (ep[me_key] == key or (ep[me_hash] == _hash and cmp(ep[me_key], key) == 0)) : return ep # Cycle through GF(2^n)-{0} ###### FROM HERE if mp.oldalg: incr = incr << 1 if (incr > mask): incr = incr ^ mp.ma_poly else: # new algorithm: do a division if (incr & 1): incr = incr ^ mp.ma_poly incr = incr >> 1 ###### TO HERE def insertdict(mp, key, _hash, value): me_hash, me_key, me_value = range(3) # rec slots ep = mp.lookdict(key, _hash) if (ep[me_value] is not NULL) : old_value = ep[me_value] ep[me_value] = value else : if (ep[me_key] is NULL): mp.ma_fill=mp.ma_fill+1 ep[me_key] = key ep[me_hash] = _hash ep[me_value] = value mp.ma_used = mp.ma_used+1 def dictresize(mp, minused): me_hash, me_key, me_value = range(3) # rec slots oldsize = mp.ma_size oldtable = mp.ma_table MINSIZE = 4 newsize = MINSIZE for i in range(len(polys)): if (newsize > minused) : newpoly = polys[i] break newsize = newsize << 1 else: return -1 _nullentry = range(3) _nullentry[me_hash] = 0 _nullentry[me_key] = NULL _nullentry[me_value] = NULL newtable = map(lambda x,y=_nullentry:y[:], range(newsize)) mp.ma_size = newsize mp.ma_poly = newpoly mp.ma_table = newtable mp.ma_fill = 0 mp.ma_used = 0 for ep in oldtable: if (ep[me_value] is not NULL): mp.insertdict(ep[me_key],ep[me_hash],ep[me_value]) return 0 # PyDict_GetItem def __getitem__(op, key): me_hash, me_key, me_value = range(3) # rec slots if not op.ma_table: raise KeyError, key _hash = hash(key) return op.lookdict(key, _hash)[me_value] # PyDict_SetItem def __setitem__(op, key, value): mp = op _hash = hash(key) ## /* if fill >= 2/3 size, double in size */ if (mp.ma_fill*3 >= mp.ma_size*2) : if (mp.dictresize(mp.ma_used*2) != 0): if (mp.ma_fill+1 > mp.ma_size): raise MemoryError mp.insertdict(key, _hash, value) # more interface functions def keys(self): me_hash, me_key, me_value = range(3) # rec slots res = [] for _hash, _key, _value in self.ma_table: if _value is not NULL: res.append( _key) return res def values(self): me_hash, me_key, me_value = range(3) # rec slots res = [] for _hash, _key, _value in self.ma_table: if _value is not NULL: res.append( _value) return res def items(self): me_hash, me_key, me_value = range(3) # rec slots res = [] for _hash, _key, _value in self.ma_table: if _value is not NULL: res.append( (_key, _value) ) return res def __cmp__(self, other): mine = self.items() others = other.items() mine.sort() others.sort() return cmp(mine, others) ###################################################### ## tests def timing(func, args=None, n=1, **keywords) : import time time=time.time appl=apply if args is None: args = () if type(args) != type(()) : args=(args,) rep=range(n) dummyarg = ("",) dummykw = {} dummyfunc = len if keywords: before=time() for i in rep: res=appl(dummyfunc, dummyarg, dummykw) empty = time()-before before=time() for i in rep: res=appl(func, args, keywords) else: before=time() for i in rep: res=appl(dummyfunc, dummyarg) empty = time()-before before=time() for i in rep: res=appl(func, args) after = time() return round(after-before-empty,4), res def test(lis, dic): for key in lis: dic[key] def nulltest(lis, dic): for key in lis: dic def string_dicts(): d1 = Dictionary() # original d2 = Dictionary(1) # other rehash for i in range(1000): s = str(i) * 5 d1[s] = d2[s] = i return d1, d2 def badnum_dicts(): d1 = Dictionary() # original d2 = Dictionary(1) # other rehash shift = 10 if EXTREME: shift = 16 for i in range(1000): bad = i << 16 d1[bad] = d2[bad] = i return d1, d2 def do_test(dict, keys, n): t0 = timing(nulltest, (keys, dict), n)[0] t1 = timing(test, (keys, dict), n)[0] return t1-t0 EXTREME=1 if __name__ == "__main__": sdold, sdnew = string_dicts() bdold, bdnew = badnum_dicts() print "timing for strings old=%.3f new=%.3f" % ( do_test(sdold, sdold.keys(), 100), do_test(sdnew, sdnew.keys(), 100) ) print "timing for bad integers old=%.3f new=%.3f" % ( do_test(bdold, bdold.keys(), 10) *10, do_test(bdnew, bdnew.keys(), 10) *10) """ Results with a shift of 10 (EXTREME=0): D:\crml_doc\platf\py>python dictest.py timing for strings old=5.097 new=5.088 timing for bad integers old=101.540 new=12.610 Results with a shift of 16 (EXTREME=1): D:\crml_doc\platf\py>python dictest.py timing for strings old=5.218 new=5.147 timing for bad integers old=571.210 new=19.220 """