Hello,
I am having trouble with performance when trying to create a cross
tabulation using numpy. Ideally, I would calculate each cell in the
cross tabulation separately because this gives me the greatest amount
of flexibility. I have included some sample code as a reference and
am really looking for better approaches to the simpleLoop method. So
far the histogram2d and histogramdd methods seem to outperform any
code I write by a factor of about 100, at least. I chalk this up to my
not understanding enough about numpy yet. Any help would be
appreciated.
Here is the test code:
import numpy as np
import time
import random
# Create a simple loop and count up the number of matching cases.
# Basic cross tabulation or histogram of the data.
# This approach is preferred because of the need to customize the
# calculation potentially for each cell.
def simpleLoop(c):
    """Cross-tabulate columns 0 and 1 of ``c``, computing one cell at a time.

    Every cell is produced by its own expression, so the per-cell
    calculation can be customised without restructuring the loops.

    Parameters
    ----------
    c : numpy.ndarray
        2-D array; column 0 and column 1 hold the two variables to
        cross-tabulate.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(a_cnt, b_cnt)``. Rows follow the sorted
        unique values of column 0 and columns follow the sorted unique
        values of column 1; each entry is the number of rows of ``c``
        holding that pair of values.
    """
    col_a = c[:, 0]
    col_b = c[:, 1]

    # Compute the category lists once instead of on every loop iteration.
    a_vals = np.unique(col_a)
    b_vals = np.unique(col_b)

    # Allocate the final 2-D shape up front; this avoids the flat index
    # counter and the division-based resize (which breaks under true
    # division and on empty input).
    result = np.zeros((len(a_vals), len(b_vals)))

    for row, i in enumerate(a_vals):
        # Select the column-1 values of the rows matching ``i`` once, so
        # the inner loop only scans that subset rather than the whole array.
        b_for_i = col_b[col_a == i]
        for col, j in enumerate(b_vals):
            result[row, col] = np.count_nonzero(b_for_i == j)
    return result
# Use numpy's histogram method to calculate the matrix of combinations
# and the number of cases in each one.
def simpleHistogram(c):
    """Cross-tabulate columns 0 and 1 of ``c`` with one ``histogramdd`` call.

    Parameters
    ----------
    c : numpy.ndarray
        2-D array; column 0 and column 1 hold the two variables to
        cross-tabulate.

    Returns
    -------
    tuple
        ``(H, edges)`` as returned by ``np.histogramdd``. ``H`` has one row
        per unique value of column 0 and one column per unique value of
        column 1. For empty input, ``H`` is a ``(0, 0)`` array and
        ``edges`` holds two empty arrays.
    """
    col_a = c[:, 0]
    col_b = c[:, 1]
    a_vals = np.unique(col_a)
    b_vals = np.unique(col_b)

    if a_vals.size == 0:
        # Nothing to bin; there is no last value to build edges from.
        return np.zeros((0, 0)), [a_vals, b_vals]

    # n edges only describe n - 1 bins, and the last bin is closed on the
    # right. Using the unique values themselves as edges would therefore
    # merge the two largest categories. Appending one extra edge past the
    # maximum gives every unique value its own bin.
    a_edges = np.append(a_vals, a_vals[-1] + 1)
    b_edges = np.append(b_vals, b_vals[-1] + 1)

    return np.histogramdd((col_a, col_b), bins=[a_edges, b_edges])
# Variation1 of simple histogram
def simpleHistogram1(c):
    """Cross-tabulate ``c`` by histogramming column 0 once per column-1 value.

    Parameters
    ----------
    c : numpy.ndarray
        2-D array; column 0 and column 1 hold the two variables to
        cross-tabulate.

    Returns
    -------
    numpy.ndarray
        Array with one row per unique value of column 0 and one column per
        unique value of column 1, holding the count for each pair. A
        ``(0, 0)`` array is returned for empty input.
    """
    col_a = c[:, 0]
    col_b = c[:, 1]

    # The bin edges are the same for every column-1 value; compute once.
    a_vals = np.unique(col_a)
    if a_vals.size == 0:
        return np.zeros((0, 0))

    # One extra edge past the maximum so the two largest categories are not
    # merged into the final (right-closed) bin.
    a_edges = np.append(a_vals, a_vals[-1] + 1)

    # histogramdd returns (H, edges); only the counts H are kept.
    columns = [
        np.histogramdd(col_a[col_b == j], bins=[a_edges])[0]
        for j in np.unique(col_b)
    ]
    return np.column_stack(columns)
if __name__ == '__main__':
    # Benchmark driver: build random two-column sample data, then time each
    # cross-tabulation approach and print the first row of its result.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    a = np.random.randint(1, 900, 200000)
    b = np.random.randint(1, 10, 200000)
    c = np.column_stack((a, b))

    print('---- Simple Loop ----')
    start = time.time()
    results = simpleLoop(c)
    # Stop the clock before printing so only the computation is timed.
    elapsed = time.time() - start
    print(results[0])
    print(elapsed)

    print('---- Histogram dd no looping ----')
    start = time.time()
    results = simpleHistogram(c)
    elapsed = time.time() - start
    # results is (H, edges); show the first row of the counts.
    print(results[0][0])
    print(elapsed)

    # NOTE: randint's upper bound is exclusive, so column 1 holds the values
    # 1..9 and the per-value loop runs 9 times, not the 10 the label says.
    print('---- Histogram run 1 time for each item in column 1 (10 times) ----')
    start = time.time()
    results = simpleHistogram1(c)
    elapsed = time.time() - start
    print(results[0])
    print(elapsed)