don't need dictionary's keys - hash table?
Peter Otten
__peter__ at web.de
Thu Jul 13 04:02:55 EDT 2006
Terry Hancock wrote:
> Nick Vatamaniuc wrote:
>> The original post just said that he wanted to index some values by
>> their urls and didn't really care about the urls themselves, so I
>> suggested that he just use the hash of the key as the key. The
>> dictionary will then take a hash of that [note that:
>> hash(string)=hash(hash(string)) ] then the dictionary will not keep
>> the reference to the urls and GC will collect them. So instead of
>> dic[urlstring']=value he will do dic[hash(urlstring)]=value. But then
>> to retrieve he will have to do old_value=dic[hash(urlstring]].
>
> Note that it is trivial to catch collisions on entry and correct them:
>
> key = hash(url_string)
> i = 0
> if d.has_key(key):
> while d.has_key(key):
> i += 1
> d[key + repr(i)] = val
> else:
> d[key] = val
>
> or something along those lines. No need to keep the original string.
How do you intend to get the value back out of the dictionary?
candidates = []
for key in xrange(hash(url)):
try:
candidates.append(d[key])
except KeyError:
break
Instead, I would put all values with the same hash into a container, along
the lines of
d.setdefault(hash(url), []).append(value))
Here is a simple implementation that can be used to put the idea to a test.
It keeps all but the first url for a cluster with identical hashes.
class Cluster(object):
"""A cluster of dictionary values with the same hash(key).
Helper for Dict.
"""
def __init__(self, default):
self.pairs = {}
self.default = default
def __setitem__(self, key, value):
assert key not in self.pairs
self.pairs[key] = value
def __getitem__(self, key):
return self.pairs.get(key, self.default)
def __repr__(self):
return "Cluster(default=%r, pairs=%r)" % (self.default, self.pairs)
def itervalues(self):
yield self.default
for value in self.pairs.itervalues():
yield value
class Dict(object):
"""A dictionary that stores hashes instead of keys as long as there are
no collisions.
The value entered first becomes the default for a cluster of values with
the same hash. It follows that you cannot implement a reliable
containment test and must make sure by exterior means that only existing
keys are looked up.
"""
def __init__(self):
self.dict = {}
def __setitem__(self, key, value):
short_key = hash(key)
try:
cluster = self.dict[short_key]
except KeyError:
self.dict[short_key] = value
else:
if isinstance(cluster, Cluster):
cluster[key] = value
else:
cluster = Cluster(cluster)
cluster[key] = value
self.dict[short_key] = cluster
def __getitem__(self, key):
cluster = self.dict[hash(key)]
if not isinstance(cluster, Cluster):
return cluster
return cluster[key]
def itervalues(self):
for value in self.dict.itervalues():
if isinstance(value, Cluster):
for v in value.itervalues():
yield v
else:
yield value
def __repr__(self):
return "Dict(%s)" % repr(self.dict)[1:-1]
if __name__ == "__main__":
class BadHash(object):
"""Key with user-defined hash.
Allows enforcing hash collisions.
"""
def __init__(self, value, hash):
self.value = value
self.hash = hash
def __hash__(self):
return hash(self.hash)
def __eq__(self, other):
return self.value == other.value
def __repr__(self):
return "BadHash(value=%r, hash=%r)" % (self.value, self.hash)
d = Dict()
for i, v in enumerate("alpha beta gamma delta".split()):
d[BadHash(v, i)] = v
for v in "sigma tau omega".split():
d[BadHash(v, 42)] = v
print d
assert d[BadHash("gamma", 2)] == "gamma"
assert d[BadHash("sigma", 42)] == "sigma"
assert d[BadHash("tau", 42)] == "tau"
# but be warned that
assert d[BadHash("whatever", 42)] == "sigma"
expected = sorted("alpha beta gamma delta sigma tau omega".split())
assert list(sorted(d.itervalues())) == expected
I fear that the memory footprint of a url is not large enough to warrant the
approach, though.
Peter
More information about the Python-list
mailing list