Performance of list.index - how to speed up a silly algorithm?
Peter Otten
__peter__ at web.de
Fri Apr 30 03:20:26 EDT 2010
MRAB wrote:
> The .index method does a linear search, checking on average 1/2 of the
> items in the list. That's why it's so slow.
>
> In order to avoid that you could build a dict of each value in
> dimension_values[col_idx] and its index in a single pass so that it
> becomes a quick lookup.
For example:
from itertools import count, izip
def iterator_factory():
# columns = ["color", "size", "weight", "value"]
rows = [
["Yellow", "Big", 2, 4],
["Blue", "Big", 3, -4],
["Blue", "Small", 10, 55],
#...
]
return iter(rows)
class Indexer(object):
def __init__(self):
self.lookup = {}
self.counter = count()
def __call__(self, value):
try:
return self.lookup[value]
except KeyError:
result = self.lookup[value] = next(self.counter)
return result
def to_list(self):
d = self.lookup
reverse = dict(izip(d.itervalues(), d.iterkeys()))
assert len(reverse) == len(d)
return [reverse[i] for i in xrange(len(reverse))]
def pairs(rows, dimension_cols, measure_cols, indexers):
for row in rows:
dv = [indexer(row[col])
for indexer, col in izip(indexers, dimension_cols)]
mv = [row[col] for col in measure_cols]
yield dv, mv
def main():
# dimension_names = ["color", "size"]
dimension_cols = [0, 1]
# measure_names = ["weight", "value"]
measure_cols = [2, 3]
indexers = [Indexer() for _ in dimension_cols]
facts = pairs(iterator_factory(),
dimension_cols, measure_cols, indexers)
facts = list(facts)
print facts
for i, indexer in enumerate(indexers):
print "%d: %s" % (i, indexer.to_list())
if __name__ == "__main__":
main()
More information about the Python-list
mailing list