# GC is very expensive: am I doing something wrong?

Peter Otten __peter__ at web.de
Tue Mar 23 09:43:54 CET 2010

```Steven D'Aprano wrote:

> On Mon, 22 Mar 2010 22:05:40 -0700, Paul Rubin wrote:
>
>> Antoine Pitrou <solipsis at pitrou.net> writes:
>>> "Orders of magnitude worse", in any case, sounds very exaggerated.
>>
>> The worst case can lose orders of magnitude if a lot of values hash to
>> the same bucket.
>
>
> Well, perhaps one order of magnitude.
>
>
>>>> for i in xrange(100):
> ...     n = 32*i+1
> ...     assert hash(2**n) == hash(2)
> ...
>>>> d1 = dict.fromkeys(xrange(100))
>>>> d2 = dict.fromkeys([2**(32*i+1) for i in xrange(100)])
>>>>
>>>> from timeit import Timer
>>>> setup = "from __main__ import d1, d2"
>>>> t1 = Timer("for k in d1.keys(): x = d1[k]", setup)
>>>> t2 = Timer("for k in d2.keys(): x = d2[k]", setup)
>>>>
>>>> min(t1.repeat(number=1000, repeat=5))
> 0.026707887649536133
>>>> min(t2.repeat(number=1000, repeat=5))
> 0.33103203773498535

But the ratio grows with the number of collisions:

\$ python extrapolate.py
10
0.00120401382446
0.00753307342529
ratio: 6.25663366337

100
0.00542402267456
0.316139936447
ratio: 58.2851428571

1000
0.00553417205811
3.36690688133
ratio: 608.384930209

\$ cat extrapolate.py
from timeit import Timer

class Item(object):
def __init__(self, value, hash=None):
self.value = value
self.hash = value if hash is None else hash
def __eq__(self, other):
return self.value == other.value
def __hash__(self):
return self.hash

setup = "from __main__ import d"
bench = "for k in d: x = d[k]"

for n, number in (10,100), (100,100), (1000,10):
print n
d1 = dict.fromkeys(Item(i) for i in xrange(n))
d2 = dict.fromkeys(Item(i, 0) for i in xrange(n))
ab = []
for d in d1, d2:
t = Timer(bench, setup)
ab.append(min(t.repeat(number=number, repeat=3)))
print ab[-1]
print "ratio:", ab[1]/ab[0]
print