# [Numpy-discussion] Performance issues with numpy cross tabulation

Justin Thomas jthomas1 at decisionanalyst.com
Mon May 16 15:09:20 EDT 2011

```Hello,

I am having trouble with performance when trying to create a cross
tabulation using numpy.  Ideally, I would calculate each cell in the
cross tabulation separately, because this gives me the greatest amount
of flexibility.  I have included some sample code as a reference and
am really looking for better approaches to the simpleLoop method.  So
far the histogram2d and histogramdd methods seem to outperform any
code I write by a factor of at least 100.  I chalk this up to not
understanding enough about numpy yet.  Any help would be
appreciated.

Here is the test code:
import numpy as np
import time
import random

# Create a simple loop and count up the number of matching cases.
# Basic cross tabulation or histogram of the data.
# This approach is preferred because of the need to potentially customize
# the calculation for each cell.
def simpleLoop(c):
    """Cross-tabulate the first two columns of ``c`` one cell at a time.

    Each cell is computed separately, so the per-cell calculation can be
    customised later.

    Parameters
    ----------
    c : 2-D array; column 0 supplies the row categories and column 1 the
        column categories.

    Returns
    -------
    Float array of shape ``(a_cnt, b_cnt)`` where ``a_cnt`` / ``b_cnt`` are
    the numbers of distinct values in column 0 / column 1.  Rows and columns
    follow the sorted order returned by ``np.unique``.  An input with no
    rows yields an empty ``(0, 0)`` array.
    """
    col_a = c[:, 0]
    col_b = c[:, 1]

    # The category lists never change, so compute them once instead of on
    # every loop iteration.
    a_vals = np.unique(col_a)
    b_vals = np.unique(col_b)

    # Allocate the final 2-D shape directly; no flat index bookkeeping and
    # no division-based resize afterwards.
    result = np.zeros((len(a_vals), len(b_vals)))

    for row, a in enumerate(a_vals):
        # Select this row category's records once; every cell in the row
        # then only scans that (much smaller) subset.
        b_subset = col_b[col_a == a]
        for col, b in enumerate(b_vals):
            result[row, col] = np.sum(b_subset == b)

    return result

# Use numpy's histogram method to calculate the matrix of combinations
# and the number of cases in each one.
def simpleHistogram(c):
    """Cross-tabulate the first two columns of ``c`` with one histogramdd call.

    Parameters
    ----------
    c : 2-D array with at least one row; column 0 and column 1 hold the two
        categorical variables.

    Returns
    -------
    The ``(hist, edges)`` tuple produced by ``np.histogramdd``.  ``hist`` has
    one row per distinct value of column 0 and one column per distinct value
    of column 1.

    Raises
    ------
    IndexError if ``c`` has no rows (there is no largest category to close
    the last bin with).
    """
    def category_edges(values):
        # histogramdd treats a sequence as bin *edges* and makes the last
        # bin closed on the right.  Using the distinct values alone would
        # therefore merge the two largest categories into one bin; adding
        # one extra edge past the maximum gives every category its own bin.
        cats = np.unique(values)
        return np.append(cats, cats[-1] + 1)

    col_a = c[:, 0]
    col_b = c[:, 1]
    return np.histogramdd(
        (col_a, col_b),
        bins=[category_edges(col_a), category_edges(col_b)],
    )

# Variation1 of simple histogram
def simpleHistogram1(c):
    """Cross-tabulate ``c`` by running one histogram per column-1 value.

    Parameters
    ----------
    c : 2-D array; column 0 supplies the row categories and column 1 the
        column categories.

    Returns
    -------
    Float array of shape ``(a_cnt, b_cnt)``: one row per distinct value of
    column 0 and one column per distinct value of column 1, both in the
    sorted order returned by ``np.unique``.  An input with no rows yields an
    empty ``(0, 0)`` array.
    """
    col_a = c[:, 0]
    col_b = c[:, 1]

    a_vals = np.unique(col_a)
    if a_vals.size == 0:
        return np.zeros((0, 0))

    # histogramdd interprets a sequence as bin edges with a right-closed
    # last bin, so one extra edge past the maximum is needed to keep the two
    # largest categories in separate bins.  Computed once for every pass.
    edges = np.append(a_vals, a_vals[-1] + 1)

    # histogramdd returns (hist, edges); only the counts ([0]) belong in the
    # stacked result.
    columns = [
        np.histogramdd(col_a[col_b == b], bins=[edges])[0]
        for b in np.unique(col_b)
    ]
    return np.column_stack(columns)

if __name__ == '__main__':
    # Benchmark driver: build 200000 random (a, b) pairs and time each
    # cross-tabulation approach on the same data.
    a = np.random.randint(1, 900, 200000)
    b = np.random.randint(1, 10, 200000)
    c = np.column_stack((a, b))

    runs = [
        ('---- Simple Loop ----', simpleLoop),
        ('---- Histogram dd no looping ----', simpleHistogram),
        ('---- Histogram run 1 time for each item in column 1 (10 times) ----',
         simpleHistogram1),
    ]

    for banner, func in runs:
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print(banner)
        start = time.time()
        results = func(c)
        print(results)
        # The elapsed time deliberately includes printing the result, as in
        # the original measurements.
        print(time.time() - start)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20110516/cdd148c3/attachment.html>
```