
I was curious about the relative performance of C++ vs Numeric Python, for operations on arrays of roughly 400,000 array elements. I built a simple single-precision array multiplication function in C++ that performs an element-by-element multiply, checking whether each element is "valid" or "missing data". Then, for comparison, I wrote a similar multiplication routine, using the Masked Array (MA) package of Numeric Python.

I compiled Numeric Python (20.1.0b2) with '-O3', by modifying setup.py to contain lines like

    OPTIMIZE=['-O3']
    ext_modules = . .
        Extension('multiarray', ['Src/multiarraymodule.c'], extra_compile_args=OPTIMIZE ),

---------------------------------------
On an 800 MHz dual processor Dell Linux box, using gcc 2.95.3,

Software              Performance
------------------------------------------------
Numeric Python        5.0e6 multiplies/second
Numeric Python -O3    6.1e6 multiplies/second
C++                   10.3e6 multiplies/second
C++ -O3               10.3e6 multiplies/second

(I tried using "plain" Numeric arrays, rather than Masked arrays, and it didn't seem to make much difference.)

Has anyone else benchmarked the relative performance of C/C++ vs Numeric Python? Does anyone know of other optimizations to Numeric Python that could be implemented?

I know a more realistic benchmark would include I/O, which might tend to reduce the apparent difference in performance.

I've attached the benchmark modules, in case someone would like to examine them.
-- Joe VanAndel National Center for Atmospheric Research http://www.atd.ucar.edu/~vanandel/ Internet: vanandel@ucar.edu import sys # test harness for Masked array performonce from MA import * #from Numeric import * from Perp.util.TimerUtility import TimerUtility def mult_test(a1, a2): res = a1 * a2 if __name__ == '__main__': repeat = 100 gates = 1000 beams = 370 if len(sys.argv) > 1: repeat = int(sys.argv[1]) t1 = ones((beams, gates), Float) a1 = masked_values(t1, -327.68) a2 = masked_values(t1, -327.68) i = 0 tu = TimerUtility(()) while (i < repeat): i = i+1 res = mult_test(a1, a2) elapsed = tu.elapsed() print 'completed %d in %f seconds' % (repeat , elapsed) cntMultiply = repeat*gates*beams print '%8.3g checked multiplies/second' % (cntMultiply/elapsed) #include <iostream> #include <stdlib.h> #include "PerfTimer.h" typedef float *FLOAT_PTR; extern void mult_test(FLOAT_PTR *a1, FLOAT_PTR *a2, FLOAT_PTR *resp, float missingValue); const int gates = 1000; const int beams = 370; int main(int argc, char *argv[]) { int repeat =100; const float missingValue = -327.68; if (argc > 1) repeat = atoi(argv[1]); FLOAT_PTR *a1 = new FLOAT_PTR[beams]; FLOAT_PTR *a2 = new FLOAT_PTR[beams]; FLOAT_PTR *res = new FLOAT_PTR[beams]; // allocate storage for 2d variables for (int b = 0; b < beams; ++b) { a1[b] = new float[gates]; a2[b] = new float[gates]; res[b] = new float[gates]; } PerfTimer pt; for (int r = 0; r < repeat; ++r) { mult_test(a1, a2, res,missingValue); } double elapsed = pt.Elapsed(); double cntMultiply = repeat*gates*beams; cout << repeat << " repetitions completed" << endl; cout << cntMultiply << "checked multiplies" << endl; cout << cntMultiply/elapsed << "checked multiplies/second" << endl; } void mult_test(FLOAT_PTR *a1, FLOAT_PTR *a2, FLOAT_PTR *resp, float missingValue) { const float atol = 1.e-8; const float rtol = 1.0e-5; for (int b=0; b < beams; ++b) { for (int g = 0; g < gates; ++g) { if (fabs(a1[b][g] - missingValue) < atol + rtol * fabs(missingValue)) { 
resp[b][g] = missingValue; continue; } else if (fabs(a2[b][g] - missingValue) < atol + rtol * fabs(missingValue)) { resp[b][g] = missingValue; } else { resp[b][g] = a1[b][g] * a2[b][g]; } } // for gates } // for beams } #include <time.h> class PerfTimer { public: // constructor, starts timing PerfTimer(); // reset starting time void Start(); // compute elapsed time since last construction or Start() double Elapsed(); private: struct timespec startTime_; }; #include "PerfTimer.h" PerfTimer::PerfTimer() { Start(); } void PerfTimer::Start() { clock_gettime(CLOCK_REALTIME, &startTime_); } double PerfTimer::Elapsed() { struct timespec stopTime; clock_gettime(CLOCK_REALTIME, &stopTime); return (stopTime.tv_sec + stopTime.tv_nsec/1.0e9 - (startTime_.tv_sec + startTime_.tv_nsec/1.0e9) ); } #CCFLAGS=-O3 CCFLAGS= CXXFLAGS=${CCFLAGS} all: arrayperf arrayperf: arrayperf.cc g++ -o arrayperf arrayperf.cc PerfTimer.cc -lm -lrt # import time class TimerUtility: """ Timer/Utility for performance measurment """ def __init__(self, verbose=1,useElapsed=0): """ ctor: can suppress printing by settings **verbose** to 0 """ # if we're timing elapsed events, including subprocesses, # then use time.time() if useElapsed: self.__timeFunc = time.time else: self.__timeFunc = time.clock self.__startTime = self.__timeFunc() self.__lastTime = self.__startTime self.__verbose = verbose def elapsed(self, msg = 'Elapsed '): """ print elapsed time since instance creation or last **elapsed()** call """ current = self.__timeFunc() delta = (current - self.__lastTime) if (self.__verbose): print '%s : %5.2f' % (msg, float(delta)) self.__lastTime = current return delta def rate(self, count, msg = 'rate'): """ print elapsed time and rate since instance creation or last **elapsed()** call """ current = self.__timeFunc() delta = (current - self.__lastTime) if (self.__verbose): print '%s : %5.2f : %6.2f' % (msg, float(delta), float(count)/float(delta)) self.__lastTime = current return delta def total(self, msg = 
'Total '): """ print total time since TimerUtility was created """ current = self.__timeFunc() diff = (current - self.__startTime) if (self.__verbose): print '%s : %5.2f' % (msg, diff) return diff