[Numpy-discussion] Optimizing reduction loops (sum(), prod(), et al.)
Citi, Luca
lciti at essex.ac.uk
Mon Jul 13 04:28:21 EDT 2009
Hi Pauli,
in my PC I have tried this and some of the regressions disappear,
maybe you can give it a try.
At the present state is compiler- and architecture-dependent,
therefore not the best choice. But it may be worth trying.
Best,
Luca
/* My additions are unindented */
/*
* "Vectorized" reduction along an axis
*
* Evaluating the inner loop in smaller blocks interleaved with the
* reduction loop aims to avoid cache misses in the loop->ret array.
*/
{
typedef unsigned long long ticks;
__inline__ ticks getticks(void)
{
unsigned a, d;
/* return clock();*/
/* asm("cpuid");*/
asm volatile("rdtsc" : "=a" (a), "=d" (d));
return (((ticks)a) | (((ticks)d) << 32));
}
npy_intp new_block_size;
ticks t0, t1;
int delta = 8;
int speed, speed_p;
/*t0 = getticks();
t0 = getticks();*/
t0 = getticks();
speed_p = 0.;
block_size = 2 + (loop->bufsize / loop->outsize / 2);
new_block_size = block_size;
/*printf("was %d", block_size);*/
for (k = 0; k < loop->size; k += block_size) {
char *bufptr[3];
block_size = new_block_size;
/*printf(" then %d (speed_p %d)", block_size, speed_p);*/
bufptr[0] = loop->bufptr[0] + k * loop->steps[0];
bufptr[1] = loop->bufptr[1] + k * loop->steps[1];
bufptr[2] = loop->bufptr[2] + k * loop->steps[2];
if (k + block_size > loop->size) {
block_size = loop->size - k;
}
for (i = i0; i <= loop->N; ++i) {
bufptr[1] += loop->instrides;
loop->function((char **)bufptr, &block_size,
loop->steps, loop->funcdata);
UFUNC_CHECK_ERROR(loop);
}
t1 = getticks();
speed = (block_size << 12) / (t1 - t0);
if (speed < speed_p)
delta = -delta;
new_block_size = (1 + ((block_size * (128 + delta)) >> 10)) << 3;
speed_p = speed;
t0 = t1;
}
/*printf(" is %d (speed_p %d)\n", block_size, speed_p);*/
}
PyArray_ITER_NEXT(loop->it);
PyArray_ITER_NEXT(loop->rit);
}
More information about the NumPy-Discussion
mailing list