[Numpy-discussion] Performance issues with numpy cross tabulation

Justin Thomas jthomas1 at decisionanalyst.com
Mon May 16 15:09:20 EDT 2011


 I am having trouble with performance when trying to create a cross
tabulation using numpy.  Ideally, I would calculate each cell in the
cross tabulation separately because this gives me the greatest amount
of flexibility.  I have included some sample code as a reference and
am really looking for better approaches to the simpleLoop method.  So
far the histogram2d and histogramdd methods seem to outperform any
code I write by a factor of about 100, at least.  I chalk this up to my
not yet understanding enough about numpy.  Any help would be appreciated.

Here is the test code:
import numpy as np
import time
import random

# Create a simple loop and count up the number of matching cases
# Basic cross tabulation or histogram of the data
# This approach is preferred because of the need to customize the
# calculation potentially for each cell.
def simpleLoop(c):
    """Count the rows of ``c`` for every (column 0, column 1) value pair.

    Each cell is still computed by its own expression, so the counting step
    can be replaced by a custom per-cell calculation.

    Parameters
    ----------
    c : numpy.ndarray
        2-D array; only columns 0 and 1 are read.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per pair of distinct values.  The
        column-0 value varies slowest, and both sets of values are visited
        in the sorted order returned by ``np.unique``.
    """
    a_col = c[:, 0]
    b_col = c[:, 1]

    # The distinct values never change inside the loops: compute them once
    # instead of once per cell.
    a_values = np.unique(a_col)
    b_values = np.unique(b_col)

    # One boolean mask per column-1 value, shared by every column-0 value.
    # This holds len(b_values) masks in memory at once in exchange for not
    # rebuilding them on every outer iteration.
    b_masks = [b_col == b for b in b_values]

    result = np.zeros(len(a_values) * len(b_values))
    idx = 0
    for a in a_values:
        a_mask = a_col == a
        for b_mask in b_masks:
            result[idx] = np.count_nonzero(a_mask & b_mask)
            idx += 1

    return result

# Use numpy's histogram method to calculate the matrix of combinations
# and the number of cases in each one.
def simpleHistogram(c):
    """Cross-tabulate columns 0 and 1 of ``c`` with one ``histogramdd`` call.

    NOTE(review): the archived post cuts the original ``return`` statement
    off after ``bins=[np.unique(c[:,``.  The call below is a reconstruction
    from the surrounding text -- confirm against the original message if it
    is available.

    Parameters
    ----------
    c : numpy.ndarray
        2-D array with at least one row; only columns 0 and 1 are read.

    Returns
    -------
    tuple
        The ``(H, edges)`` pair returned by ``np.histogramdd``.  ``H[i, j]``
        is the number of rows whose column-0 value is the i-th distinct
        value and whose column-1 value is the j-th distinct value.
    """
    a_col = c[:, 0]
    b_col = c[:, 1]

    edges = []
    for column in (a_col, b_col):
        values = np.unique(column)
        # Bin edges need one more entry than there are bins.  Using the
        # distinct values alone would merge the two largest values into
        # the final (right-closed) bin, so add an edge past the maximum.
        edges.append(np.append(values, values[-1] + 1))

    return np.histogramdd((a_col, b_col), bins=edges)

# Variation1 of simple histogram
def simpleHistogram1(c):
    """Cross-tabulate ``c`` with one 1-D histogram per column-1 value.

    NOTE(review): the archived post lost most of the loop body; only the
    fragment ``bins=[np.unique(c[:,0])]) or 0)`` survives.  The loop below
    is a reconstruction consistent with the surviving ``column_stack`` of
    each histogram's counts -- confirm against the original message if it
    is available.

    Parameters
    ----------
    c : numpy.ndarray
        2-D array with at least one row; only columns 0 and 1 are read.

    Returns
    -------
    numpy.ndarray
        2-D array of counts with one row per distinct column-0 value and
        one column per distinct column-1 value, both in sorted order.
    """
    a_col = c[:, 0]
    b_col = c[:, 1]

    a_values = np.unique(a_col)
    # One bin per distinct column-0 value: add an edge past the maximum so
    # the two largest values do not share the final (right-closed) bin.
    edges = np.append(a_values, a_values[-1] + 1)

    results = []
    for b in np.unique(b_col):
        # Histogram of column 0 restricted to the rows where column 1 == b
        results.append(np.histogram(a_col[b_col == b], bins=edges))

    # np.histogram returns (counts, edges); keep only the counts
    return np.column_stack([result[0] for result in results])

if __name__ == '__main__':
    # Benchmark driver: build a two-column array of random integers, then
    # time each cross-tabulation approach and print a sample of its output.
    # randint's upper bound is exclusive, so column 0 holds values 1..899
    # and column 1 holds values 1..9.
    a = np.random.randint(1, 900, 200000)
    b = np.random.randint(1, 10, 200000)
    c = np.column_stack((a, b))

    # Single-argument print(...) parses the same on Python 2 and Python 3.
    print('---- Simple Loop ----')
    start = time.time()
    results = simpleLoop(c)
    print(results[0])
    print(time.time() - start)

    print('---- Histogram dd no looping ----')
    start = time.time()
    results = simpleHistogram(c)
    # results is (H, edges); show the first row of the count matrix
    print(results[0][0])
    print(time.time() - start)

    # The closing of this label was lost in the archived post; restored here.
    print('---- Histogram run 1 time for each item in column 1 (10 times) ----')
    start = time.time()
    results = simpleHistogram1(c)
    print(results[0])
    print(time.time() - start)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20110516/cdd148c3/attachment.html>

More information about the NumPy-Discussion mailing list