[Numpy-discussion] OT: performance in C extension; OpenMP, or SSE ?

Tue Feb 15 22:50:42 EST 2011

I don't have the slightest idea what I'm doing, but....

____
file name - the_lib.c
___
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <math.h>

void dists2d(	   double *a_ps, int na,
		   double *b_ps, int nb,
		   double *dist, int num_threads)
{

    int i, j;
    int dynamic=0;
    omp_set_dynamic(dynamic);
    omp_set_num_threads(num_threads);
    double ax,ay, dif_x, dif_y;
    int nx1=2;
    int nx2=2;

#pragma omp parallel for private(j, i,ax,ay, dif_x, dif_y)
	for(i=0;i<na;i++)
	  {
		ax=a_ps[i*nx1];
                 ay=a_ps[i*nx1+1];
		for(j=0;j<nb;j++)
		  {     dif_x = ax - b_ps[j*nx2];
                         dif_y = ay - b_ps[j*nx2+1];
                         dist[2*i+j]  = sqrt(dif_x*dif_x+dif_y*dif_y);
		  }
	  }
}

________

COMPILE:
__________
gcc -c the_lib.c -fPIC -fopenmp -ffast-math
gcc -shared -o the_lib.so the_lib.o -lgomp -lm

____

the_python_prog.py
_____________

from ctypes import CDLL, c_int, c_void_p
import time

import numpy as np

# Load the shared library built from the_lib.c.
my_lib = CDLL('the_lib.so')  # or full path to lib

# Declare the C signature: dists2d(double*, int, double*, int, double*, int)
# returns void, so there is no meaningful return value to capture.
my_lib.dists2d.argtypes = [c_void_p, c_int, c_void_p, c_int, c_void_p, c_int]
my_lib.dists2d.restype = None

na = 329
nb = 340
a = np.random.rand(na, 2)   # na points as (x, y) rows
b = np.random.rand(nb, 2)   # nb points as (x, y) rows
c = np.zeros(na * nb)       # flat output buffer, one slot per (a, b) pair
trials = 100
max_threads = 24

# These conversions do not change between calls; doing them once keeps
# the timed loop focused on the C call itself.
a_ptr = a.ctypes.data_as(c_void_p)
b_ptr = b.ctypes.data_as(c_void_p)
c_ptr = c.ctypes.data_as(c_void_p)
na2 = c_int(na)
nb2 = c_int(nb)

for k in range(1, max_threads):
    n_threads = c_int(k)

    start = time.time()
    for k1 in range(trials):
        my_lib.dists2d(a_ptr, na2, b_ptr, nb2, c_ptr, n_threads)
    elapsed = (time.time() - start) / trials
    # Single pre-formatted string: same output text on Python 2 and 3.
    print("c_threads %d  time  %s" % (k, elapsed))

____
Results on my machine, dual xeon, 12 cores
na=329
nb=340
____

100 trials each:
c_threads 1  time  0.00109949827194
c_threads 2  time  0.0005726313591
c_threads 3  time  0.000429179668427
c_threads 4  time  0.000349278450012
c_threads 5  time  0.000287139415741
c_threads 6  time  0.000252468585968
c_threads 7  time  0.000222821235657
c_threads 8  time  0.000206289291382
c_threads 9  time  0.000187981128693
c_threads 10  time  0.000172770023346
c_threads 11  time  0.000164999961853
c_threads 12  time  0.000157740116119

____
____
Results on my machine, dual xeon, 12 cores
na=3290
nb=3400
______
100 trials each:
c_threads 1  time  0.10744508028
c_threads 2  time  0.0542239999771
c_threads 3  time  0.037127559185
c_threads 4  time  0.0280736112595
c_threads 5  time  0.0228648614883
c_threads 6  time  0.0194904088974
c_threads 7  time  0.0165715909004
c_threads 8  time  0.0145838689804
c_threads 9  time  0.0130002498627
c_threads 10  time  0.0116940999031
c_threads 11  time  0.0107557415962
c_threads 12  time  0.00990005016327 (speedup almost 11)