[Numpy-discussion] Optimizing reduction loops (sum(), prod(), et al.)

Mon Jul 13 04:28:21 EDT 2009

Hi Pauli,
I have tried this on my PC and some of the regressions disappear;
maybe you can give it a try.
In its present state it is compiler- and architecture-dependent,
and therefore not the best choice, but it may be worth trying.
Best,
Luca

/* My additions are unindented */

            /*
             * "Vectorized" reduction along an axis
             *
             * Evaluating the inner loop in smaller blocks interleaved with the
             * reduction loop aims to avoid cache misses in the loop->ret array.
             */
/*
 * Blocked reduction with a self-tuning block size.
 *
 * The outer loop walks loop->size in chunks of block_size elements.  After
 * each chunk the elapsed tick count is read, a throughput figure is derived
 * from it, and the size of the next chunk is nudged up or down depending on
 * whether throughput improved or got worse relative to the previous chunk.
 *
 * Uses loop, k, i, i0 and block_size from the enclosing scope (not declared
 * in this block).
 */
{
/* 64-bit tick counter value. */
typedef unsigned long long ticks;
/*
 * Read the tick counter via the "rdtsc" instruction and return the two
 * 32-bit halves (outputs "=a" low, "=d" high) combined into one 64-bit value.
 *
 * NOTE(review): this is a function defined inside a block, which relies on
 * nested-function support (a GNU C extension), and the inline asm is
 * x86-specific -- consistent with the author's remark that the patch is
 * compiler- and architecture-dependent.
 */
__inline__ ticks getticks(void)
{
 unsigned a, d;
/* return clock();*/
/* asm("cpuid");*/
 asm volatile("rdtsc" : "=a" (a), "=d" (d));
 return (((ticks)a) | (((ticks)d) << 32));
}
/* Block size to use for the next outer-loop iteration. */
npy_intp new_block_size;
/* Tick readings bracketing one outer-loop iteration. */
ticks t0, t1;
/* Signed step of the block-size adjustment; its sign is flipped when speed drops. */
int delta = 8;
/* Throughput of the current and of the previous block (see formula below). */
int speed, speed_p;
/*t0 = getticks();
t0 = getticks();*/
t0 = getticks();
/* NOTE(review): floating literal assigned to an int; the stored value is 0. */
speed_p = 0.;
            /* Initial block size derived from the buffer and output sizes. */
            block_size = 2 + (loop->bufsize / loop->outsize / 2);
new_block_size = block_size;
/*printf("was %d", block_size);*/
            /*
             * k advances by the block size actually processed in the
             * iteration (i.e. after the clamp below).
             */
            for (k = 0; k < loop->size; k += block_size) {
                char *bufptr[3];
/* Apply the block size chosen at the end of the previous iteration. */
block_size = new_block_size;
/*printf(" then %d (speed_p %d)", block_size, speed_p);*/

                /* Offset each of the three buffer pointers to element k. */
                bufptr[0] = loop->bufptr[0] + k * loop->steps[0];
                bufptr[1] = loop->bufptr[1] + k * loop->steps[1];
                bufptr[2] = loop->bufptr[2] + k * loop->steps[2];

                /* Clamp the final block so it does not run past loop->size. */
                if (k + block_size > loop->size) {
                    block_size = loop->size - k;
                }

                /*
                 * Reduction loop for this block: bufptr[1] is advanced by
                 * loop->instrides before every call to loop->function, which
                 * is invoked on block_size elements.
                 */
                for (i = i0; i <= loop->N; ++i) {
                    bufptr[1] += loop->instrides;
                    loop->function((char **)bufptr, &block_size,
                                   loop->steps, loop->funcdata);
                    /* Macro defined elsewhere; presumably bails out on error -- confirm. */
                    UFUNC_CHECK_ERROR(loop);
                }
t1 = getticks();
/*
 * Throughput estimate: block_size * 4096 divided by the elapsed ticks.
 *
 * NOTE(review): divides by (t1 - t0); if two counter reads ever returned the
 * same value this would be a division by zero -- confirm whether a guard is
 * wanted.  The quotient is stored in an int, so small values truncate to 0
 * and the comparison below then carries no information.
 */
speed = (block_size << 12) / (t1 - t0);
/* Slower than the previous block: reverse the direction of adjustment. */
if (speed < speed_p)
  delta = -delta;
/*
 * Next block size: (1 + block_size * (128 + delta) / 1024) * 8, using integer
 * shifts.  With delta = +/-8 this is roughly block_size * (1 +/- 1/16) plus 8,
 * and the result is always a multiple of 8.
 */
new_block_size = (1 + ((block_size * (128 + delta)) >> 10)) << 3;
speed_p = speed;
/* The end of this measurement is the start of the next one. */
t0 = t1;
            }
/*printf(" is %d (speed_p %d)\n", block_size, speed_p);*/
}                
            PyArray_ITER_NEXT(loop->it);
            PyArray_ITER_NEXT(loop->rit);
        }