Selecting k smallest or largest elements from a large list in python; (benchmarking)

Dmitry Chichkov dchichkov at gmail.com
Thu Sep 2 11:20:14 CEST 2010


By the way, improving n-ARG-smallest (that returns indexes as well as
values) is actually more desirable than just regular n-smallest:

== Result ==
1.38639092445 nargsmallest
3.1569879055 nargsmallest_numpy_argsort
1.29344892502 nargsmallest_numpy_argmin

Note that numpy array constructor eats around 0.789.


== Code ==
from operator import itemgetter
from random import randint, random
from itertools import islice
from time import time

def nargsmallest(iterable, n):
    it   = enumerate(iterable)
    mins = sorted(islice(it, n), key = itemgetter(1))
    loser = mins[-1][1]         # largest of smallest
    for el in it:
        if el[1] <= loser:      # NOTE: preserve dups
            mins.append(el)
            mins.sort(key = itemgetter(1))
            mins.pop()
            loser = mins[-1][1]
    return mins

def nargsmallest_numpy_argsort(iter, k):
    distances = N.asarray(iter)
    return [(i, distances[i]) for i in distances.argsort()[0:k]]

def nargsmallest_numpy_argmin(iter, k):
    mins = []
    distances = N.asarray(iter)
    for i in xrange(k):
        j = distances.argmin()
        mins.append((j, distances[j]))
        distances[j] = float('inf')
    return mins

test_data = [randint(10, 50) + random() for i in range(10000000)]
K = 10

init = time()
mins = nargsmallest(test_data, K)
print time() - init, 'nargsmallest:', mins[:2]

import numpy as N
init = time()
mins = nargsmallest_numpy_argsort(test_data, K)
print time() - init, 'nargsmallest_numpy_argsort:', mins[:2]

init = time()
mins = nargsmallest_numpy_argmin(test_data, K)
print time() - init, 'nargsmallest_numpy_argmin:', mins[:2]



More information about the Python-list mailing list