performance comparison of C++ vs Numeric (MA) operations.

I was curious about the relative performance of C++ vs Numeric Python, for operations on arrays of roughly 400,000 array elements. I built a simple array single precision multiplication function in C++, that performs an element by element multiply, checking whether each element is "valid" or "missing data". Then, for comparison, I wrote a similar multiplication routine, using the Masked Array (MA) package of Numeric Python. I compiled Numeric Python (20.1.0b2) with '-O3', by modifying setup.py to contain lines like OPTIMIZE=['-O3'] ext_modules = . . Extension('multiarray', ['Src/multiarraymodule.c'], extra_compile_args=OPTIMIZE ), --------------------------------------- On an 800 MHz dual processor Dell Linux box, using gcc 2.95.3, Software Performance ------------------------------------------------ Numeric Python 5.0e6 multiplies/second Numeric Python -O3 6.1e6 multiplies/second C++ 10.3e6 multiplies/second C++ -O3 10.3e6 multiplies/second (I tried using "plain" Numeric arrays, rather than Masked arrays, and it didn't seem to make much difference.) Has anyone else benchmarked the relative performance of C/C++ vs Numeric Python? Does anyone know of other optimizations to Numeric Python that could be implemented? I know a more realistic benchmark would include I/O, which might tend to reduce the apparent difference in performance. I've attached the benchmark modules, in case someone would like to examine them. 
-- Joe VanAndel National Center for Atmospheric Research http://www.atd.ucar.edu/~vanandel/ Internet: vanandel@ucar.edu import sys # test harness for Masked array performonce from MA import * #from Numeric import * from Perp.util.TimerUtility import TimerUtility def mult_test(a1, a2): res = a1 * a2 if __name__ == '__main__': repeat = 100 gates = 1000 beams = 370 if len(sys.argv) > 1: repeat = int(sys.argv[1]) t1 = ones((beams, gates), Float) a1 = masked_values(t1, -327.68) a2 = masked_values(t1, -327.68) i = 0 tu = TimerUtility(()) while (i < repeat): i = i+1 res = mult_test(a1, a2) elapsed = tu.elapsed() print 'completed %d in %f seconds' % (repeat , elapsed) cntMultiply = repeat*gates*beams print '%8.3g checked multiplies/second' % (cntMultiply/elapsed) #include <iostream> #include <stdlib.h> #include "PerfTimer.h" typedef float *FLOAT_PTR; extern void mult_test(FLOAT_PTR *a1, FLOAT_PTR *a2, FLOAT_PTR *resp, float missingValue); const int gates = 1000; const int beams = 370; int main(int argc, char *argv[]) { int repeat =100; const float missingValue = -327.68; if (argc > 1) repeat = atoi(argv[1]); FLOAT_PTR *a1 = new FLOAT_PTR[beams]; FLOAT_PTR *a2 = new FLOAT_PTR[beams]; FLOAT_PTR *res = new FLOAT_PTR[beams]; // allocate storage for 2d variables for (int b = 0; b < beams; ++b) { a1[b] = new float[gates]; a2[b] = new float[gates]; res[b] = new float[gates]; } PerfTimer pt; for (int r = 0; r < repeat; ++r) { mult_test(a1, a2, res,missingValue); } double elapsed = pt.Elapsed(); double cntMultiply = repeat*gates*beams; cout << repeat << " repetitions completed" << endl; cout << cntMultiply << "checked multiplies" << endl; cout << cntMultiply/elapsed << "checked multiplies/second" << endl; } void mult_test(FLOAT_PTR *a1, FLOAT_PTR *a2, FLOAT_PTR *resp, float missingValue) { const float atol = 1.e-8; const float rtol = 1.0e-5; for (int b=0; b < beams; ++b) { for (int g = 0; g < gates; ++g) { if (fabs(a1[b][g] - missingValue) < atol + rtol * fabs(missingValue)) { 
resp[b][g] = missingValue; continue; } else if (fabs(a2[b][g] - missingValue) < atol + rtol * fabs(missingValue)) { resp[b][g] = missingValue; } else { resp[b][g] = a1[b][g] * a2[b][g]; } } // for gates } // for beams } #include <time.h> class PerfTimer { public: // constructor, starts timing PerfTimer(); // reset starting time void Start(); // compute elapsed time since last construction or Start() double Elapsed(); private: struct timespec startTime_; }; #include "PerfTimer.h" PerfTimer::PerfTimer() { Start(); } void PerfTimer::Start() { clock_gettime(CLOCK_REALTIME, &startTime_); } double PerfTimer::Elapsed() { struct timespec stopTime; clock_gettime(CLOCK_REALTIME, &stopTime); return (stopTime.tv_sec + stopTime.tv_nsec/1.0e9 - (startTime_.tv_sec + startTime_.tv_nsec/1.0e9) ); } #CCFLAGS=-O3 CCFLAGS= CXXFLAGS=${CCFLAGS} all: arrayperf arrayperf: arrayperf.cc g++ -o arrayperf arrayperf.cc PerfTimer.cc -lm -lrt # import time class TimerUtility: """ Timer/Utility for performance measurment """ def __init__(self, verbose=1,useElapsed=0): """ ctor: can suppress printing by settings **verbose** to 0 """ # if we're timing elapsed events, including subprocesses, # then use time.time() if useElapsed: self.__timeFunc = time.time else: self.__timeFunc = time.clock self.__startTime = self.__timeFunc() self.__lastTime = self.__startTime self.__verbose = verbose def elapsed(self, msg = 'Elapsed '): """ print elapsed time since instance creation or last **elapsed()** call """ current = self.__timeFunc() delta = (current - self.__lastTime) if (self.__verbose): print '%s : %5.2f' % (msg, float(delta)) self.__lastTime = current return delta def rate(self, count, msg = 'rate'): """ print elapsed time and rate since instance creation or last **elapsed()** call """ current = self.__timeFunc() delta = (current - self.__lastTime) if (self.__verbose): print '%s : %5.2f : %6.2f' % (msg, float(delta), float(count)/float(delta)) self.__lastTime = current return delta def total(self, msg = 
'Total '): """ print total time since TimerUtility was created """ current = self.__timeFunc() diff = (current - self.__startTime) if (self.__verbose): print '%s : %5.2f' % (msg, diff) return diff

I have a timing benchmark for MA that computes the ratio MA/Numeric for two cases: 1. there is actually no mask 2. there is a mask For N=50,000 these ratios are usually around 1.3 and 1.8 respectively. It makes sense in the second case that the number might be around 2 since you have to pass through the mask data as well, even if it is only bytes. In short, there is this much overhead to MA. If you got MA/C++ = 1.67 it would indicate Numpy/C++ comparable. The tests Jim did when he first wrote it were about 10% worse than C. Your C++ uses a special value instead of a mask array which may mean that you traded space for CPU time, and using large arrays like that maybe that causes some page faults (?) Anyway you're comparing apples and oranges a little. Anyway, my point is this is probably an MA issue rather than a Numpy issue. However, please note that I did not (yet) do any of the normal profiling and testing that one would do to speed MA up, such as putting key parts in C. This is just not an issue for me right now. -----Original Message----- From: numpy-discussion-admin@lists.sourceforge.net [mailto:numpy-discussion-admin@lists.sourceforge.net]On Behalf Of Joe Van Andel Sent: Tuesday, June 12, 2001 5:20 PM To: numpy-discussion Subject: [Numpy-discussion] performance comparison of C++ vs Numeric (MA) operations. I was curious about the relative performance of C++ vs Numeric Python, for operations on arrays of roughly 400,000 array elements. I built a simple array single precision multiplication function in C++, that performs an element by element multiply, checking whether each element is "valid" or "missing data". Then, for comparision, I wrote a similar multiplication routine, using the Masked Array (MA) package of Numeric Python. I compiled Numeric Python (20.1.0b2) with '-O3', by modifying setup.py to contain lines like OPTIMIZE=['-O3'] ext_modules = . . 
Extension('multiarray', ['Src/multiarraymodule.c'], extra_compile_args=OPTIMIZE ), --------------------------------------- On an 800 Mhz dual processor Dell Linux box, using gcc 2.95.3, Software Performance ------------------------------------------------ Numeric Python 5.0e6 multiplies/second Numeric Python -03 6.1e6 multiplies/second C++ 10.3e6 multiplies/second C++ -O3 10.3e6 multiplies/second (I tried using "plain" Numeric arrays, rather than Masked arrays, and it didn't seem to make much difference.) Has anyone else benchmarked the relative performance of C/C++ vs Numeric Python? Does anyone know of other optimizations to Numeric Python that could be implemented? I know a more realistic benchmark would include I/O, which might tend to reduce the apparent difference in performance. I've attached the benchmark modules, in case someone would like to examine them. -- Joe VanAndel National Center for Atmospheric Research http://www.atd.ucar.edu/~vanandel/ Internet: vanandel@ucar.edu

PS my test was on double precision, failed to notice that too. -----Original Message----- From: numpy-discussion-admin@lists.sourceforge.net [mailto:numpy-discussion-admin@lists.sourceforge.net]On Behalf Of Joe Van Andel Sent: Tuesday, June 12, 2001 5:20 PM To: numpy-discussion Subject: [Numpy-discussion] performance comparison of C++ vs Numeric (MA) operations. I was curious about the relative performance of C++ vs Numeric Python, for operations on arrays of roughly 400,000 array elements. I built a simple array single precision multiplication function in C++, that performs an element by element multiply, checking whether each element is "valid" or "missing data". Then, for comparision, I wrote a similar multiplication routine, using the Masked Array (MA) package of Numeric Python. I compiled Numeric Python (20.1.0b2) with '-O3', by modifying setup.py to contain lines like OPTIMIZE=['-O3'] ext_modules = . . Extension('multiarray', ['Src/multiarraymodule.c'], extra_compile_args=OPTIMIZE ), --------------------------------------- On an 800 Mhz dual processor Dell Linux box, using gcc 2.95.3, Software Performance ------------------------------------------------ Numeric Python 5.0e6 multiplies/second Numeric Python -03 6.1e6 multiplies/second C++ 10.3e6 multiplies/second C++ -O3 10.3e6 multiplies/second (I tried using "plain" Numeric arrays, rather than Masked arrays, and it didn't seem to make much difference.) Has anyone else benchmarked the relative performance of C/C++ vs Numeric Python? Does anyone know of other optimizations to Numeric Python that could be implemented? I know a more realistic benchmark would include I/O, which might tend to reduce the apparent difference in performance. I've attached the benchmark modules, in case someone would like to examine them. -- Joe VanAndel National Center for Atmospheric Research http://www.atd.ucar.edu/~vanandel/ Internet: vanandel@ucar.edu

If I read your C++ right (and I may not have, I'm a C++ novice), you allocated the memory for all three arrays, and then performed your loop. In the Python version, the result array is allocated when the multiplication is performed, so you are allocating and freeing the result array each time in the loop. That may slow things down a little. In a real application, you are less likely to be re-doing the same computation over and over again, so the allocation would happen only once. You might try something like this, and see if it is any faster (it is more memory efficient). Note also that there is some overhead in function calls in Python, so you may get some speed up if you inline the call to mult_test. You can decide for yourself if this would still be a fair comparison. You might try something like this, and see if it is any faster (it is more memory efficient) (unfortunately, MA doesn't seem to support the third argument to multiply). My version (I don't have TimerUtility, so I used time.clock instead) got these times: Your code: completed 1000 in 99.050000 seconds 3.74e+06 checked multiplies/second My code: alternative completed 1000 in 80.070000 seconds 4.62e+06 checked multiplies/second It did buy you something: here is the code: #!/usr/bin/env python2.1 import sys # test harness for Masked array performonce #from MA import * from Numeric import * from time import clock def mult_test(a1, a2): res = a1 * a2 if __name__ == '__main__': repeat = 100 gates = 1000 beams = 370 if len(sys.argv) > 1: repeat = int(sys.argv[1]) t1 = ones((beams, gates), Float) a1 = t1 a2 = t1 # a1 = masked_values(t1, -327.68) # a2 = masked_values(t1, -327.68) i = 0 start = clock() while (i < repeat): i = i+1 res = mult_test(a1, a2) elapsed = clock() - start print 'completed %d in %f seconds' % (repeat , elapsed) cntMultiply = repeat*gates*beams print '%8.3g checked multiplies/second' % (cntMultiply/elapsed) print # alternative: res = zeros(a1.shape,Float) i = 0 start = clock() while (i < 
repeat): i = i+1 multiply(a1, a2, res) elapsed = clock() - start print 'alternative completed %d in %f seconds' % (repeat , elapsed) cntMultiply = repeat*gates*beams print '%8.3g checked multiplies/second' % (cntMultiply/elapsed) print Another note: calling ones with Float as your type gives you a Python float, which is a C double. Use 'f' or Float32 to get a C float. I've found on Intel hardware, doubles are just as fast (the FPU used doubles anyway), but they do use more memory, so this could make a difference. -Chris -- Christopher Barker, Ph.D. ChrisHBarker@home.net --- --- --- http://members.home.net/barkerlohmann ---@@ -----@@ -----@@ ------@@@ ------@@@ ------@@@ Oil Spill Modeling ------ @ ------ @ ------ @ Water Resources Engineering ------- --------- -------- Coastal and Fluvial Hydrodynamics -------------------------------------- ------------------------------------------------------------------------

I have a timing benchmark for MA that computes the ratio MA/Numeric for two cases: 1. there is actually no mask 2. there is a mask For N=50,000 these ratios are usually around 1.3 and 1.8 respectively. It makes sense in the second case that the number might be around 2 since you have to pass through the mask data as well, even if it is only bytes. In short, there is this much overhead to MA. If you got MA/C++ = 1.67 it would indicate Numpy/C++ comparable. The tests Jim did when he first wrote it were about 10% worse than C. Your C++ uses a special value instead of a mask array which may mean that you traded space for CPU time, and using large arrays like that maybe that causes some page faults (?) Anyway you're comparing apples and oranges a little. Anyway, my point is this is probably an MA issue rather than a Numpy issue. However, please note that I did not (yet) do any of the normal profiling and testing that one would do to speed MA up, such as putting key parts in C. This is just not an issue for me right now. -----Original Message----- From: numpy-discussion-admin@lists.sourceforge.net [mailto:numpy-discussion-admin@lists.sourceforge.net]On Behalf Of Joe Van Andel Sent: Tuesday, June 12, 2001 5:20 PM To: numpy-discussion Subject: [Numpy-discussion] performance comparison of C++ vs Numeric (MA) operations. I was curious about the relative performance of C++ vs Numeric Python, for operations on arrays of roughly 400,000 array elements. I built a simple array single precision multiplication function in C++, that performs an element by element multiply, checking whether each element is "valid" or "missing data". Then, for comparision, I wrote a similar multiplication routine, using the Masked Array (MA) package of Numeric Python. I compiled Numeric Python (20.1.0b2) with '-O3', by modifying setup.py to contain lines like OPTIMIZE=['-O3'] ext_modules = . . 
Extension('multiarray', ['Src/multiarraymodule.c'], extra_compile_args=OPTIMIZE ), --------------------------------------- On an 800 Mhz dual processor Dell Linux box, using gcc 2.95.3, Software Performance ------------------------------------------------ Numeric Python 5.0e6 multiplies/second Numeric Python -03 6.1e6 multiplies/second C++ 10.3e6 multiplies/second C++ -O3 10.3e6 multiplies/second (I tried using "plain" Numeric arrays, rather than Masked arrays, and it didn't seem to make much difference.) Has anyone else benchmarked the relative performance of C/C++ vs Numeric Python? Does anyone know of other optimizations to Numeric Python that could be implemented? I know a more realistic benchmark would include I/O, which might tend to reduce the apparent difference in performance. I've attached the benchmark modules, in case someone would like to examine them. -- Joe VanAndel National Center for Atmospheric Research http://www.atd.ucar.edu/~vanandel/ Internet: vanandel@ucar.edu

PS my test was on double precision, failed to notice that too. -----Original Message----- From: numpy-discussion-admin@lists.sourceforge.net [mailto:numpy-discussion-admin@lists.sourceforge.net]On Behalf Of Joe Van Andel Sent: Tuesday, June 12, 2001 5:20 PM To: numpy-discussion Subject: [Numpy-discussion] performance comparison of C++ vs Numeric (MA) operations. I was curious about the relative performance of C++ vs Numeric Python, for operations on arrays of roughly 400,000 array elements. I built a simple array single precision multiplication function in C++, that performs an element by element multiply, checking whether each element is "valid" or "missing data". Then, for comparision, I wrote a similar multiplication routine, using the Masked Array (MA) package of Numeric Python. I compiled Numeric Python (20.1.0b2) with '-O3', by modifying setup.py to contain lines like OPTIMIZE=['-O3'] ext_modules = . . Extension('multiarray', ['Src/multiarraymodule.c'], extra_compile_args=OPTIMIZE ), --------------------------------------- On an 800 Mhz dual processor Dell Linux box, using gcc 2.95.3, Software Performance ------------------------------------------------ Numeric Python 5.0e6 multiplies/second Numeric Python -03 6.1e6 multiplies/second C++ 10.3e6 multiplies/second C++ -O3 10.3e6 multiplies/second (I tried using "plain" Numeric arrays, rather than Masked arrays, and it didn't seem to make much difference.) Has anyone else benchmarked the relative performance of C/C++ vs Numeric Python? Does anyone know of other optimizations to Numeric Python that could be implemented? I know a more realistic benchmark would include I/O, which might tend to reduce the apparent difference in performance. I've attached the benchmark modules, in case someone would like to examine them. -- Joe VanAndel National Center for Atmospheric Research http://www.atd.ucar.edu/~vanandel/ Internet: vanandel@ucar.edu

If I read your C++ right (and I may not have, I'm a C++ novice), you allocated the memory for all three arrays, and then performed your loop. In the Python version, the result array is allocated when the multiplication is performed, so you are allocating and freeing the result array each time in the loop. That may slow things down a little. In a real application, you are less likely to be re-doing the same computation over and over again, so the allocation would happen only once. You might try something like this, and see if it is any faster (it is more memory efficient). Note also that there is some overhead in function calls in Python, so you may get some speed up if you inline the call to mult_test. You can decide for yourself if this would still be a fair comparison. You might try something like this, and see if it is any faster (it is more memory efficient) (unfortunately, MA doesn't seem to support the third argument to multiply). My version (I don't have TimerUtility, so I used time.clock instead) got these times: Your code: completed 1000 in 99.050000 seconds 3.74e+06 checked multiplies/second My code: alternative completed 1000 in 80.070000 seconds 4.62e+06 checked multiplies/second It did buy you something: here is the code: #!/usr/bin/env python2.1 import sys # test harness for Masked array performonce #from MA import * from Numeric import * from time import clock def mult_test(a1, a2): res = a1 * a2 if __name__ == '__main__': repeat = 100 gates = 1000 beams = 370 if len(sys.argv) > 1: repeat = int(sys.argv[1]) t1 = ones((beams, gates), Float) a1 = t1 a2 = t1 # a1 = masked_values(t1, -327.68) # a2 = masked_values(t1, -327.68) i = 0 start = clock() while (i < repeat): i = i+1 res = mult_test(a1, a2) elapsed = clock() - start print 'completed %d in %f seconds' % (repeat , elapsed) cntMultiply = repeat*gates*beams print '%8.3g checked multiplies/second' % (cntMultiply/elapsed) print # alternative: res = zeros(a1.shape,Float) i = 0 start = clock() while (i < 
repeat): i = i+1 multiply(a1, a2, res) elapsed = clock() - start print 'alternative completed %d in %f seconds' % (repeat , elapsed) cntMultiply = repeat*gates*beams print '%8.3g checked multiplies/second' % (cntMultiply/elapsed) print Another note: calling ones with Float as your type gives you a Python float, which is a C double. Use 'f' or Float32 to get a C float. I've found on Intel hardware, doubles are just as fast (the FPU used doubles anyway), but they do use more memory, so this could make a difference. -Chris -- Christopher Barker, Ph.D. ChrisHBarker@home.net --- --- --- http://members.home.net/barkerlohmann ---@@ -----@@ -----@@ ------@@@ ------@@@ ------@@@ Oil Spill Modeling ------ @ ------ @ ------ @ Water Resources Engineering ------- --------- -------- Coastal and Fluvial Hydrodynamics -------------------------------------- ------------------------------------------------------------------------
participants (3)
-
Chris Barker
-
Joe Van Andel
-
Paul F. Dubois