Mailman 3 performance comparison of C++ vs Numeric (MA) operations. - NumPy-Discussion

June 13, 2001

      I was curious about the relative performance of C++ vs Numeric Python,
for operations on arrays of roughly 400,000 array elements.  I built a
simple array single precision multiplication function in C++, that
performs an element by element multiply, checking whether each element
is "valid" or "missing data".

Then, for comparison, I wrote a similar multiplication routine, using
the Masked Array (MA) package of Numeric Python.

I compiled Numeric Python (20.1.0b2) with '-O3', by modifying setup.py
to contain lines like

OPTIMIZE=['-O3']

ext_modules = 
.
.
 Extension('multiarray', ['Src/multiarraymodule.c'],
                                extra_compile_args=OPTIMIZE
                                ),

---------------------------------------

On an 800 MHz dual processor Dell Linux box, using gcc 2.95.3,

Software                Performance
------------------------------------------------                 
Numeric Python	        5.0e6 multiplies/second
Numeric Python -O3	6.1e6 multiplies/second
C++	               10.3e6 multiplies/second
C++ -O3		       10.3e6 multiplies/second

(I tried using "plain" Numeric arrays, rather than Masked arrays, and it
didn't seem to make much difference.)

Has anyone else benchmarked the relative performance of C/C++ vs Numeric
Python?  

Does anyone know of other optimizations to Numeric Python that could be
implemented?

I know a more realistic benchmark would include I/O, which might tend to
reduce the apparent difference in performance.

I've attached the benchmark modules, in case someone would like to
examine them. 
-- 
Joe VanAndel  	          
National Center for Atmospheric Research
http://www.atd.ucar.edu/~vanandel/
Internet: vanandel@ucar.edu

import sys

# test harness for Masked array performance

from MA import *
#from Numeric import *

from Perp.util.TimerUtility import TimerUtility

def mult_test(a1, a2):
    """ Multiply **a1** by **a2** and return the product.

    The operands only need to support the ``*`` operator; the benchmark
    harness passes two masked arrays of identical shape.
    """
    # Return the product: the caller assigns the result of this call, and
    # without an explicit return it would always receive None.
    return a1 * a2

if __name__ == '__main__':
    # Problem size: a beams x gates grid, multiplied `repeat` times.
    beams = 370
    gates = 1000
    repeat = 100

    # An optional first command-line argument overrides the repeat count.
    if len(sys.argv) > 1:
        repeat = int(sys.argv[1])

    # Both operands are masked views of the same all-ones array; any
    # element equal to -327.68 would be treated as missing data.
    # NOTE(review): Float is presumably Numeric's double-precision typecode,
    # whereas the C++ benchmark uses single-precision float -- confirm.
    t1 = ones((beams, gates), Float)
    a1 = masked_values(t1, -327.68)
    a2 = masked_values(t1, -327.68)

    # verbose=0 keeps the timer silent; this block prints its own summary.
    tu = TimerUtility(verbose=0)
    for i in range(repeat):
        res = mult_test(a1, a2)
    elapsed = tu.elapsed()

    cntMultiply = repeat*gates*beams
    print('completed %d in %f seconds' % (repeat , elapsed))
    print('%8.3g checked multiplies/second' % (cntMultiply/elapsed))

#include <math.h>
#include <stdlib.h>

#include <iostream>

#include "PerfTimer.h"

// One row of the 2-D data: a pointer to an array of floats.
typedef float *FLOAT_PTR;
// Element-by-element multiply of a1 and a2 into resp, propagating
// missingValue; each argument is an array of `beams` row pointers and
// each row is indexed up to `gates` elements.
extern void mult_test(FLOAT_PTR *a1, FLOAT_PTR *a2, FLOAT_PTR *resp, float missingValue);

// Dimensions of the benchmark grid: `beams` rows of `gates` elements each.
const int gates = 1000;
const int beams =  370;

int main(int argc, char *argv[])

{
    int repeat =100;
    const float missingValue = -327.68;

    if (argc > 1) repeat = atoi(argv[1]);

    FLOAT_PTR *a1 = new FLOAT_PTR[beams];
    FLOAT_PTR *a2 = new FLOAT_PTR[beams];
    FLOAT_PTR *res = new FLOAT_PTR[beams];

    // allocate storage for 2d variables 
    for (int b = 0; b < beams; ++b) {
        a1[b] = new float[gates];
        a2[b] = new float[gates];
        res[b] = new float[gates];
    }
    PerfTimer pt;

    for (int r = 0; r < repeat; ++r) {
        mult_test(a1, a2, res,missingValue);
    }
    double elapsed = pt.Elapsed();
    double cntMultiply = repeat*gates*beams;

    cout << repeat << " repetitions completed" << endl;
    cout << cntMultiply << "checked multiplies" <<  endl;

    cout << cntMultiply/elapsed << "checked multiplies/second" << endl;

}
// Element-by-element multiply of two beams x gates grids with missing-data
// propagation.
//
//   a1, a2        input grids: `beams` row pointers, `gates` floats per row
//   resp          output grid of the same layout, overwritten in full
//   missingValue  sentinel marking an invalid element
//
// An element is treated as missing when it lies within
// absTol + relTol * |missingValue| of the sentinel.  If either input element
// is missing the output element is set to missingValue; otherwise it is the
// product of the two inputs.
void mult_test(FLOAT_PTR *a1, FLOAT_PTR *a2, FLOAT_PTR *resp, float missingValue)
{
    // Named absTol/relTol rather than atol/rtol so the locals do not shadow
    // the atol() function declared by <stdlib.h>.
    const float absTol = 1.e-8;
    const float relTol = 1.0e-5;

    // The tolerance depends only on missingValue, so compute it once instead
    // of once (or twice) per element.
    const double tolerance =
        absTol + relTol * fabs(static_cast<double>(missingValue));

    for (int b = 0; b < beams; ++b) {
        const float *in1 = a1[b];
        const float *in2 = a2[b];
        float *out = resp[b];

        for (int g = 0; g < gates; ++g) {
            const bool missing =
                fabs(static_cast<double>(in1[g] - missingValue)) < tolerance ||
                fabs(static_cast<double>(in2[g] - missingValue)) < tolerance;

            out[g] = missing ? missingValue : in1[g] * in2[g];
        } // for gates
    } // for beams
}

#include <time.h>

// Simple interval timer built on clock_gettime().
//
// Typical use: construct the timer immediately before the code to be
// measured, then call Elapsed() afterwards to read the interval in seconds.
class PerfTimer {
public: 
    // constructor, starts timing (equivalent to calling Start())
    PerfTimer();
    // reset starting time to "now"
    void Start();
    // compute elapsed time, in seconds, since last construction or Start();
    // does not reset the starting time, so it may be called repeatedly
    double Elapsed();

private:
    // timestamp captured by the most recent Start()
    struct timespec startTime_;
};

#include "PerfTimer.h"

// Clock used for every timestamp.  A monotonic clock is preferred for
// interval timing because the realtime (wall) clock can be stepped by NTP or
// an administrator while a measurement is in progress; fall back to the
// realtime clock only where the monotonic one is not defined.
#ifdef CLOCK_MONOTONIC
static const clockid_t kPerfTimerClock = CLOCK_MONOTONIC;
#else
static const clockid_t kPerfTimerClock = CLOCK_REALTIME;
#endif

// Construct the timer and start timing immediately.
PerfTimer::PerfTimer() {
    Start();
}

// Record the current time as the new starting point.
void
PerfTimer::Start() {
    clock_gettime(kPerfTimerClock, &startTime_);
}

// Return the number of seconds since construction or the last Start().
// The starting point is left untouched.
double
PerfTimer::Elapsed() {
    struct timespec stopTime;

    clock_gettime(kPerfTimerClock, &stopTime);

    // Subtract the integer fields first and convert afterwards: turning each
    // absolute timestamp into a double before subtracting discards the low
    // bits of the nanosecond field.
    const double seconds =
        static_cast<double>(stopTime.tv_sec - startTime_.tv_sec);
    const double nanoseconds =
        static_cast<double>(stopTime.tv_nsec - startTime_.tv_nsec);

    return seconds + nanoseconds / 1.0e9;
}

# Build the C++ array-multiply benchmark.
# Swap the two CCFLAGS lines below to build with optimisation.
#CCFLAGS=-O3
CCFLAGS=
CXXFLAGS=${CCFLAGS}

.PHONY: all
all: arrayperf 

# CXXFLAGS must appear on the compile line: it was previously defined but
# never passed to g++, so enabling -O3 above had no effect on the binary.
# PerfTimer.cc and PerfTimer.h are prerequisites so that editing the timer
# triggers a rebuild.
arrayperf: arrayperf.cc PerfTimer.cc PerfTimer.h
	g++ ${CXXFLAGS} -o arrayperf arrayperf.cc PerfTimer.cc -lm -lrt

#

import time

class TimerUtility:
    """ Timer/Utility for performance measurement

    The timer starts when the instance is created.  **elapsed()** and
    **rate()** measure the interval since the previous **elapsed()** or
    **rate()** call (or since creation) and then restart that interval;
    **total()** measures from creation and never resets anything.
    """
    def __init__(self, verbose=1,useElapsed=0):
        """ ctor: can suppress printing by settings **verbose** to 0

        **useElapsed** selects the time source: true means time.time(),
        false (the default) means time.clock().
        """
        # if we're timing elapsed events, including subprocesses,
        # then use time.time()
        if useElapsed:
            self.__timeFunc = time.time
        else:
            self.__timeFunc = time.clock

        self.__startTime = self.__timeFunc()
        self.__lastTime = self.__startTime
        self.__verbose = verbose

    def __lap(self):
        """ return the time since the last lap and start a new lap
        """
        current = self.__timeFunc()
        delta = current - self.__lastTime
        self.__lastTime = current
        return delta

    # The single-argument, parenthesised print form used below produces the
    # same output whether print is a statement or a function.

    def elapsed(self, msg = 'Elapsed '):
        """ print elapsed time since instance creation or last **elapsed()** call

        Returns the elapsed time.
        """
        delta = self.__lap()
        if (self.__verbose):
            print('%s : %5.2f' % (msg, float(delta)))
        return delta

    def rate(self, count, msg = 'rate'):
        """ print elapsed time and rate since instance creation or last **elapsed()** call

        **count** is the number of events that occurred in the interval.
        Returns the elapsed time (not the rate).
        """
        delta = self.__lap()
        if (self.__verbose):
            if delta > 0:
                print('%s : %5.2f : %6.2f' % (msg, float(delta),
                                              float(count)/float(delta)))
            else:
                # A coarse clock can report a zero-length interval; no rate
                # can be computed, and dividing would raise ZeroDivisionError.
                print('%s : %5.2f : (interval too short to compute a rate)'
                      % (msg, float(delta)))
        return delta

    def total(self, msg = 'Total '):
        """ print total time since TimerUtility was created

        Returns the total time; the **elapsed()** interval is not reset.
        """
        diff = self.__timeFunc() - self.__startTime
        if (self.__verbose):
            print('%s : %5.2f' % (msg, diff))
        return diff

performance comparison of C++ vs Numeric (MA) operations.

Joe Van Andel

Paul F. Dubois

Paul F. Dubois

Chris Barker

Paul F. Dubois

Paul F. Dubois

Chris Barker

tags

participants (3)