[Numpy-discussion] OpenBLAS on Mac

Sat Feb 22 20:43:00 EST 2014

On 23/02/14 00:11, Sturla Molden wrote:

> Did the upgrade to Mavericks do this?
>

Testing different matrix sizes and averaging over 30 trials, I find that 
Accelerate, OpenBLAS and MKL are actually quite similar. Accelerate is 
perhaps the winner, but it really depends on the matrix size.

See for yourself.

:-)

Sturla

List of attachments:

Plots of the average runtime:
dgemm_test.png
dgemm_test2.png

C codes:
perftest_openblas.c
perftest_accelerate.c
perftest_mkl.c

Timings from my MacBook Pro (2.4 GHz i7)
accelerate.txt
openblas.txt
mkl.txt

-------------- next part --------------
A non-text attachment was scrubbed...
Name: dgemm_test.png
Type: image/png
Size: 59517 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20140223/2df62421/attachment.png>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: dgemm_test2.png
Type: image/png
Size: 49065 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20140223/2df62421/attachment-0001.png>
-------------- next part --------------
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "mkl.h"

/* Matrix dimensions for the first sweep: 20 sizes, from 10 up to 1000. */
const int matrix_size[] = {
    10, 13, 16, 21, 26,
    34, 43, 55, 70, 89,
    113, 144, 183, 234, 298,
    379, 483, 616, 785, 1000
};

/* Matrix dimensions for the second sweep: powers of two, 4 through 2048. */
const int matrix_size_pow2[] = {
    4, 8, 16, 32, 64,
    128, 256, 512, 1024, 2048
};

/* Number of timed calls made for each matrix dimension. */
const int NREPEATS = 30;

/* Largest matrix dimension; main allocates MAX_N*MAX_N doubles per matrix. */
const int MAX_N = 2048;

/*
 * Convert the interval between two tick counts into nanoseconds.
 *
 * _t0, _t1  start and end tick counts (main passes mach_absolute_time()
 *           readings)
 * tb_info   timebase whose numer/denom ratio scales ticks to nanoseconds
 *
 * The arithmetic is done in long double and narrowed to double on return.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1, 
                  const mach_timebase_info_data_t *tb_info)
{
    const long double ticks = (long double)_t1 - (long double)_t0;
    const long double scale_num = (long double)tb_info->numer;
    const long double scale_den = (long double)tb_info->denom;

    /* Multiply before dividing, in that order. */
    return (double)(ticks * scale_num / scale_den);
}

/*
 * Fill x[0..n-1] with pseudo-random doubles in [0, 1).
 *
 * Two 16-bit multiply-with-carry streams are combined into one 32-bit
 * value, which is then scaled by 2^-32.  The generator state is static,
 * so successive calls continue the same sequence.
 */
void fill_with_random(const int n, double *x)
{
    static unsigned int m_w = 123456;
    static unsigned int m_z = 5635273;
    const double scale = 2.3283064365386963e-10;   /* 2^-32 */
    int k = 0;

    while (k < n) {
        unsigned int bits;

        m_z = 36969 * (m_z & 65535) + (m_z >> 16);
        m_w = 18000 * (m_w & 65535) + (m_w >> 16);
        bits = (m_z << 16) + m_w;
        x[k] = bits * scale;
        k++;
    }
}

void statistics(const int n, const double *x, 
                   double *m, double *s, 
                   double *min, double *max)
{
    double sum_x=0.0, cx=0.0, sum_cxcx=0.0, _m;
    double minval, maxval, v;
    int i;
    for (i=0; i<n; i++) sum_x += x[i];
    _m = sum_x / (double)n;
    for (i=0; i<n; i++) {
        cx = x[i] - _m;
        sum_cxcx += cx*cx;
    }    
    *m = _m;
    *s = sqrt(sum_cxcx / (double)(n-1));
    minval = *x;
    maxval = *x;
    for (i=1; i<n; i++) {
        v = *x++;
        maxval = (maxval < v ? v : maxval);
        minval = (minval > v ? v : minval);
    }
    *max = maxval;
    *min = minval;
}

/*
 * Time cblas_dgemm on square matrices for each dimension in sizes[0..count-1].
 *
 * Every dimension is timed NREPEATS times; one line with the dimension and
 * the mean, standard deviation, minimum and maximum run time in nanoseconds
 * is printed per dimension.  A, B and C are the operand buffers; only the
 * leading n*n elements are used for dimension n.  C is updated in place
 * (beta is 1.0).
 */
static void run_sweep(const int *sizes, int count,
                      const double *A, const double *B, double *C,
                      const mach_timebase_info_data_t *tb_info)
{
    double nanosec[NREPEATS];
    double mean, std, min, max;
    uint64_t t0, t1;
    int i, j, k, m, n;

    for (i = 0; i < count; i++) {
        n = sizes[i];
        m = n; k = n;
        for (j = 0; j < NREPEATS; j++) {

            t0 = mach_absolute_time();

            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);

            t1 = mach_absolute_time();

            nanosec[j] = nanodiff(t0, t1, tb_info);
        }

        statistics(NREPEATS, nanosec, &mean, &std, &min, &max);

        printf("[%4d,    %.4e,    %.4e,    %.4e,    %.4e],\n", 
                n, mean, std, min, max);
    }
}

/*
 * Benchmark driver: allocates three MAX_N x MAX_N matrices with 64-byte
 * alignment, fills them with pseudo-random values, then prints timing
 * tables for the matrix_size sweep and the matrix_size_pow2 sweep,
 * separated by two blank lines.
 *
 * Returns EXIT_FAILURE if any matrix cannot be allocated, 0 otherwise.
 * Command-line arguments are ignored.
 */
int main(int argc, char **argv)
{
    /* Table lengths come from the arrays themselves so the loop bounds
       cannot drift out of step with the tables. */
    const int n_sizes = (int)(sizeof matrix_size / sizeof matrix_size[0]);
    const int n_pow2 = (int)(sizeof matrix_size_pow2 / sizeof matrix_size_pow2[0]);
    const size_t n_bytes = (size_t)MAX_N * (size_t)MAX_N * sizeof(double);

    mach_timebase_info_data_t tb_info;

    double *A = (double*)mkl_malloc(n_bytes,64); 
    double *B = (double*)mkl_malloc(n_bytes,64); 
    double *C = (double*)mkl_malloc(n_bytes,64);

    (void)argc;
    (void)argv;

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "failed to allocate %d x %d matrices\n", MAX_N, MAX_N);
        if (A != NULL) mkl_free(A);
        if (B != NULL) mkl_free(B);
        if (C != NULL) mkl_free(C);
        return EXIT_FAILURE;
    }

    mach_timebase_info(&tb_info);

    fill_with_random(MAX_N*MAX_N, A);
    fill_with_random(MAX_N*MAX_N, B);
    fill_with_random(MAX_N*MAX_N, C);

    run_sweep(matrix_size, n_sizes, A, B, C, &tb_info);

    printf("\n\n");

    run_sweep(matrix_size_pow2, n_pow2, A, B, C, &tb_info);

    mkl_free(A); mkl_free(B); mkl_free(C);
    return 0;
}

-------------- next part --------------
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cblas.h>

/* Matrix dimensions for the first sweep: 20 sizes, from 10 up to 1000. */
const int matrix_size[] = {
    10, 13, 16, 21, 26,
    34, 43, 55, 70, 89,
    113, 144, 183, 234, 298,
    379, 483, 616, 785, 1000
};

/* Matrix dimensions for the second sweep: powers of two, 4 through 2048. */
const int matrix_size_pow2[] = {
    4, 8, 16, 32, 64,
    128, 256, 512, 1024, 2048
};

/* Number of timed calls made for each matrix dimension. */
const int NREPEATS = 30;

/* Largest matrix dimension; main allocates MAX_N*MAX_N doubles per matrix. */
const int MAX_N = 2048;

/*
 * Convert the interval between two tick counts into nanoseconds.
 *
 * _t0, _t1  start and end tick counts (main passes mach_absolute_time()
 *           readings)
 * tb_info   timebase whose numer/denom ratio scales ticks to nanoseconds
 *
 * The arithmetic is done in long double and narrowed to double on return.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1, 
                  const mach_timebase_info_data_t *tb_info)
{
    const long double ticks = (long double)_t1 - (long double)_t0;
    const long double scale_num = (long double)tb_info->numer;
    const long double scale_den = (long double)tb_info->denom;

    /* Multiply before dividing, in that order. */
    return (double)(ticks * scale_num / scale_den);
}

/*
 * Fill x[0..n-1] with pseudo-random doubles in [0, 1).
 *
 * Two 16-bit multiply-with-carry streams are combined into one 32-bit
 * value, which is then scaled by 2^-32.  The generator state is static,
 * so successive calls continue the same sequence.
 */
void fill_with_random(const int n, double *x)
{
    static unsigned int m_w = 123456;
    static unsigned int m_z = 5635273;
    const double scale = 2.3283064365386963e-10;   /* 2^-32 */
    int k = 0;

    while (k < n) {
        unsigned int bits;

        m_z = 36969 * (m_z & 65535) + (m_z >> 16);
        m_w = 18000 * (m_w & 65535) + (m_w >> 16);
        bits = (m_z << 16) + m_w;
        x[k] = bits * scale;
        k++;
    }
}

void statistics(const int n, const double *x, 
                   double *m, double *s, 
                   double *min, double *max)
{
    double sum_x=0.0, cx=0.0, sum_cxcx=0.0, _m;
    double minval, maxval, v;
    int i;
    for (i=0; i<n; i++) sum_x += x[i];
    _m = sum_x / (double)n;
    for (i=0; i<n; i++) {
        cx = x[i] - _m;
        sum_cxcx += cx*cx;
    }    
    *m = _m;
    *s = sqrt(sum_cxcx / (double)(n-1));
    minval = *x;
    maxval = *x;
    for (i=1; i<n; i++) {
        v = *x++;
        maxval = (maxval < v ? v : maxval);
        minval = (minval > v ? v : minval);
    }
    *max = maxval;
    *min = minval;
}

/*
 * Time cblas_dgemm on square matrices for each dimension in sizes[0..count-1].
 *
 * Every dimension is timed NREPEATS times; one line with the dimension and
 * the mean, standard deviation, minimum and maximum run time in nanoseconds
 * is printed per dimension.  A, B and C are the operand buffers; only the
 * leading n*n elements are used for dimension n.  C is updated in place
 * (beta is 1.0).
 */
static void run_sweep(const int *sizes, int count,
                      const double *A, const double *B, double *C,
                      const mach_timebase_info_data_t *tb_info)
{
    double nanosec[NREPEATS];
    double mean, std, min, max;
    uint64_t t0, t1;
    int i, j, k, m, n;

    for (i = 0; i < count; i++) {
        n = sizes[i];
        m = n; k = n;
        for (j = 0; j < NREPEATS; j++) {

            t0 = mach_absolute_time();

            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);

            t1 = mach_absolute_time();

            nanosec[j] = nanodiff(t0, t1, tb_info);
        }

        statistics(NREPEATS, nanosec, &mean, &std, &min, &max);

        printf("[%4d,    %.4e,    %.4e,    %.4e,    %.4e],\n", 
                n, mean, std, min, max);
    }
}

/*
 * Benchmark driver: allocates three MAX_N x MAX_N matrices, fills them
 * with pseudo-random values, then prints timing tables for the
 * matrix_size sweep and the matrix_size_pow2 sweep, separated by two
 * blank lines.
 *
 * Returns EXIT_FAILURE if any matrix cannot be allocated, 0 otherwise.
 * Command-line arguments are ignored.
 */
int main(int argc, char **argv)
{
    /* Table lengths come from the arrays themselves so the loop bounds
       cannot drift out of step with the tables. */
    const int n_sizes = (int)(sizeof matrix_size / sizeof matrix_size[0]);
    const int n_pow2 = (int)(sizeof matrix_size_pow2 / sizeof matrix_size_pow2[0]);
    const size_t n_bytes = (size_t)MAX_N * (size_t)MAX_N * sizeof(double);

    mach_timebase_info_data_t tb_info;

    double *A = (double*)malloc(n_bytes); 
    double *B = (double*)malloc(n_bytes); 
    double *C = (double*)malloc(n_bytes);

    (void)argc;
    (void)argv;

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "failed to allocate %d x %d matrices\n", MAX_N, MAX_N);
        /* free(NULL) is a no-op, so no per-pointer guard is needed. */
        free(A); free(B); free(C);
        return EXIT_FAILURE;
    }

    mach_timebase_info(&tb_info);

    fill_with_random(MAX_N*MAX_N, A);
    fill_with_random(MAX_N*MAX_N, B);
    fill_with_random(MAX_N*MAX_N, C);

    run_sweep(matrix_size, n_sizes, A, B, C, &tb_info);

    printf("\n\n");

    run_sweep(matrix_size_pow2, n_pow2, A, B, C, &tb_info);

    free(A); free(B); free(C);
    return 0;
}

-------------- next part --------------
/*

See license.txt.

*/

#include <assert.h>
/* #include <CoreServices/CoreServices.h> */
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <unistd.h>
#include <stdlib.h>

/*
 * State of one interval timer.
 */
typedef struct {
    /* One of the PERF_TIMER_* values defined below. */
    int status;
    /* mach_absolute_time() reading stored by start_perf_timer(). */
    uint64_t t0;
    /* mach_absolute_time() reading stored by stop_perf_timer(). */
    uint64_t t1; 
    /* Timebase filled in by mach_timebase_info() in create_perf_timer();
       its numer/denom ratio is used to scale tick counts to nanoseconds. */
    mach_timebase_info_data_t tb_info;
} perf_timer_t;

/* Timer status values.
   CLEAR:   set by create_perf_timer() and clear_perf_timer().
   RUNNING: set by start_perf_timer().
   STOPPED: set by stop_perf_timer(); required before the elapsed time
            can be read or the timer cleared. */
const int PERF_TIMER_CLEAR = 0;
const int PERF_TIMER_RUNNING = 1;
const int PERF_TIMER_STOPPED = 2;

perf_timer_t *create_perf_timer(void)
{
    perf_timer_t *timer = (perf_timer_t *)malloc(sizeof(perf_timer_t));
    if (timer == NULL) goto error;
    timer->status = PERF_TIMER_CLEAR;
    mach_timebase_info(&timer->tb_info);
    return timer;
error:
    if (timer != NULL) free(timer);
    return NULL;
}

/*
 * Release a timer obtained from create_perf_timer().
 * Passing NULL is allowed and does nothing.
 */
void destroy_perf_timer(perf_timer_t *timer)
{
    /* free(NULL) is a no-op, so no explicit NULL test is required. */
    free(timer);
}

/*
 * Record the start tick and mark the timer as running.
 *
 * Returns 0 on success, -1 if timer is NULL or already running.
 */
int start_perf_timer(perf_timer_t *timer)
{
    if (timer == NULL || timer->status == PERF_TIMER_RUNNING) {
        return -1;
    }

    timer->t0 = mach_absolute_time();
    timer->status = PERF_TIMER_RUNNING;
    return 0;
}

/*
 * Record the end tick and mark the timer as stopped.
 *
 * Returns 0 on success, -1 if timer is NULL or not currently running.
 */
int stop_perf_timer(perf_timer_t *timer)
{
    const int is_running =
        (timer != NULL) && (timer->status == PERF_TIMER_RUNNING);

    if (!is_running) {
        return -1;
    }

    timer->t1 = mach_absolute_time();
    timer->status = PERF_TIMER_STOPPED;
    return 0;
}

/*
 * Return a stopped timer to the PERF_TIMER_CLEAR state.
 *
 * Returns 0 on success, -1 if timer is NULL or not in the stopped state.
 */
int clear_perf_timer(perf_timer_t *timer)
{
    if (timer != NULL && timer->status == PERF_TIMER_STOPPED) {
        timer->status = PERF_TIMER_CLEAR;
        return 0;
    }
    return -1;
}

/*
 * Store the interval measured by a stopped timer, in nanoseconds, in
 * *nanosec.
 *
 * Returns 0 on success.  Returns -1, leaving *nanosec untouched, if timer
 * is NULL, the timer is not in the stopped state, or nanosec is NULL.
 */
int nanodiff_perf_timer(const perf_timer_t *timer, long double *nanosec)
{
    long double ticks, scale_num, scale_den;

    if (timer == NULL || timer->status != PERF_TIMER_STOPPED) {
        return -1;
    }
    if (nanosec == NULL) {
        return -1;
    }

    ticks = (long double)(timer->t1) - (long double)(timer->t0);
    scale_num = (long double)(timer->tb_info.numer);
    scale_den = (long double)(timer->tb_info.denom);

    /* Multiply before dividing, in that order. */
    *nanosec = ticks * scale_num / scale_den;
    return 0;
}

/*
 * Store the length of one timer tick, in nanoseconds, in
 * *nanosec_resolution (the timebase numer/denom ratio).
 *
 * Returns 0 on success, -1 if either pointer is NULL.  The timer state is
 * not consulted.
 */
int nanores_perf_timer(const perf_timer_t *timer, long double *nanosec_resolution)
{
    if (timer == NULL || nanosec_resolution == NULL) {
        return -1;
    }

    *nanosec_resolution = (long double)(timer->tb_info.numer)
                        / (long double)(timer->tb_info.denom);
    return 0;
}

-------------- next part --------------
[  10,    1.8249e+03,    7.5052e+03,    4.3600e+02,    4.1562e+04],
[  13,    9.4907e+02,    2.7929e+02,    8.9100e+02,    2.4260e+03],
[  16,    2.3644e+03,    6.0826e+03,    1.0810e+03,    3.4278e+04],
[  21,    2.7637e+03,    2.5322e+03,    2.0580e+03,    1.3706e+04],
[  26,    3.6573e+03,    1.8597e+03,    2.9700e+03,    9.9230e+03],
[  34,    5.7156e+03,    8.1641e+02,    5.4040e+03,    8.9120e+03],
[  43,    1.0956e+04,    2.3088e+03,    1.0122e+04,    1.9766e+04],
[  55,    2.1788e+04,    1.1596e+04,    1.8955e+04,    8.2381e+04],
[  70,    3.8077e+04,    4.2441e+03,    3.6798e+04,    5.4417e+04],
[  89,    7.0032e+04,    5.1650e+03,    6.8568e+04,    9.6636e+04],
[ 113,    1.3922e+05,    1.3897e+04,    1.3513e+05,    2.0781e+05],
[ 144,    1.2979e+05,    7.2822e+04,    9.9868e+04,    4.4901e+05],
[ 183,    2.0268e+05,    1.0931e+04,    1.9760e+05,    2.4071e+05],
[ 234,    4.0132e+05,    9.1694e+04,    3.5535e+05,    7.1502e+05],
[ 298,    8.6309e+05,    2.6523e+05,    6.9578e+05,    1.4252e+06],
[ 379,    1.5985e+06,    4.0820e+05,    1.4171e+06,    2.8624e+06],
[ 483,    3.0864e+06,    6.0088e+05,    2.8326e+06,    5.4301e+06],
[ 616,    6.6489e+06,    1.5387e+06,    5.7692e+06,    1.0613e+07],
[ 785,    1.4754e+07,    3.4156e+06,    1.2107e+07,    2.1649e+07],
[1000,    3.1154e+07,    5.5740e+06,    2.4195e+07,    4.0360e+07],

[   4,    1.9477e+02,    3.2552e+02,    1.2800e+02,    1.9150e+03],
[   8,    3.8077e+02,    7.9966e+01,    3.5500e+02,    8.0200e+02],
[  16,    2.7588e+03,    6.3393e+03,    1.4950e+03,    3.6304e+04],
[  32,    6.6004e+03,    2.1850e+03,    5.5530e+03,    1.5607e+04],
[  64,    3.5622e+04,    1.2820e+03,    3.4828e+04,    4.1095e+04],
[ 128,    9.6848e+04,    3.6344e+04,    7.9238e+04,    2.3208e+05],
[ 256,    4.5615e+05,    9.2865e+04,    4.2092e+05,    8.2640e+05],
[ 512,    3.3334e+06,    2.6552e+05,    3.2502e+06,    4.7093e+06],
[1024,    3.5191e+07,    5.5548e+06,    2.5881e+07,    4.5181e+07],
[2048,    2.7725e+08,    1.5691e+07,    2.3737e+08,    2.9737e+08],
-------------- next part --------------
[  10,    2.4680e+03,    1.0001e+04,    5.2700e+02,    5.5399e+04],
[  13,    8.8493e+02,    3.0951e+02,    8.0500e+02,    2.5100e+03],
[  16,    3.3365e+03,    6.1387e+03,    2.0660e+03,    3.5831e+04],
[  21,    3.5532e+03,    6.4100e+02,    3.1980e+03,    6.2850e+03],
[  26,    4.5164e+03,    1.6061e+02,    4.3510e+03,    5.0860e+03],
[  34,    5.4646e+03,    8.0315e+02,    5.0180e+03,    9.6500e+03],
[  43,    9.5117e+03,    7.1500e+02,    9.1560e+03,    1.3219e+04],
[  55,    1.5673e+04,    9.1927e+02,    1.5228e+04,    2.0464e+04],
[  70,    2.3438e+04,    2.3378e+03,    2.2688e+04,    3.5773e+04],
[  89,    4.8458e+04,    9.8972e+02,    4.7769e+04,    5.3483e+04],
[ 113,    6.3075e+04,    1.8880e+04,    4.7137e+04,    1.0836e+05],
[ 144,    9.1631e+04,    1.5589e+04,    8.3371e+04,    1.3086e+05],
[ 183,    1.7476e+05,    6.5323e+03,    1.7318e+05,    2.0928e+05],
[ 234,    4.6118e+05,    1.5100e+05,    3.2846e+05,    6.8653e+05],
[ 298,    1.3137e+06,    2.2717e+04,    1.2116e+06,    1.3345e+06],
[ 379,    2.6917e+06,    1.7294e+05,    1.8249e+06,    2.8141e+06],
[ 483,    4.5088e+06,    1.1748e+06,    2.5604e+06,    5.5026e+06],
[ 616,    9.0293e+06,    2.1052e+06,    5.3924e+06,    1.1081e+07],
[ 785,    2.2010e+07,    1.7049e+06,    1.7738e+07,    2.6944e+07],
[1000,    3.4778e+07,    8.2156e+06,    2.2372e+07,    4.5916e+07],

[   4,    4.3840e+02,    4.3370e+02,    2.5200e+02,    2.6970e+03],
[   8,    4.4577e+02,    1.7493e+02,    3.8900e+02,    1.3540e+03],
[  16,    1.9848e+03,    4.6771e+02,    1.6710e+03,    4.4150e+03],
[  32,    3.0532e+03,    5.4374e+02,    2.7920e+03,    5.2560e+03],
[  64,    9.3250e+03,    6.6518e+02,    8.9210e+03,    1.2746e+04],
[ 128,    5.4620e+04,    2.0069e+03,    5.3877e+04,    6.5200e+04],
[ 256,    3.7464e+05,    7.6448e+03,    3.7055e+05,    4.0864e+05],
[ 512,    3.7553e+06,    1.3328e+06,    2.9490e+06,    6.1633e+06],
[1024,    2.9947e+07,    5.5860e+06,    2.3243e+07,    4.3105e+07],
[2048,    2.4223e+08,    2.3419e+07,    1.9434e+08,    2.8706e+08],
-------------- next part --------------
[  10,    1.3607e+03,    5.1284e+03,    3.4300e+02,    2.8490e+04],
[  13,    6.4280e+02,    1.9452e+02,    5.9600e+02,    1.6710e+03],
[  16,    7.2043e+02,    6.6691e+01,    6.9700e+02,    1.0690e+03],
[  21,    1.5310e+04,    7.3601e+04,    1.8240e+03,    4.0500e+05],
[  26,    3.1701e+03,    1.6513e+03,    2.7050e+03,    1.0508e+04],
[  34,    1.2321e+04,    3.9041e+04,    4.9730e+03,    2.1902e+05],
[  43,    9.0754e+03,    1.8779e+03,    8.5860e+03,    1.8982e+04],
[  55,    1.4239e+04,    3.5869e+03,    1.3076e+04,    3.0638e+04],
[  70,    2.2525e+04,    5.4555e+03,    2.1133e+04,    5.1306e+04],
[  89,    4.4427e+04,    6.8388e+03,    4.2697e+04,    8.0575e+04],
[ 113,    8.3632e+04,    1.0850e+04,    8.0893e+04,    1.4103e+05],
[ 144,    1.5535e+05,    1.6323e+04,    1.5149e+05,    2.4163e+05],
[ 183,    3.1204e+05,    2.8211e+04,    2.5645e+05,    4.4332e+05],
[ 234,    6.2586e+05,    3.5000e+04,    6.1664e+05,    8.1070e+05],
[ 298,    1.1201e+06,    1.3749e+05,    9.2511e+05,    1.3264e+06],
[ 379,    2.5531e+06,    6.4311e+05,    1.6633e+06,    3.3693e+06],
[ 483,    5.3612e+06,    7.7730e+05,    2.9665e+06,    5.8146e+06],
[ 616,    8.0512e+06,    2.3215e+06,    5.5969e+06,    1.2458e+07],
[ 785,    1.8979e+07,    4.2977e+06,    1.1792e+07,    2.3341e+07],
[1000,    3.6060e+07,    5.8910e+06,    2.2430e+07,    4.7225e+07],

[   4,    3.7207e+02,    6.6756e+02,    1.9800e+02,    3.9020e+03],
[   8,    3.7760e+02,    1.2692e+02,    3.2600e+02,    1.0430e+03],
[  16,    1.2662e+03,    1.9406e+02,    1.1360e+03,    2.2460e+03],
[  32,    6.7047e+03,    2.3775e+03,    5.6690e+03,    1.9156e+04],
[  64,    2.1552e+04,    1.6013e+04,    1.4132e+04,    6.6789e+04],
[ 128,    1.0602e+05,    4.5159e+03,    1.0420e+05,    1.2973e+05],
[ 256,    7.7634e+05,    8.7983e+04,    6.3991e+05,    9.0461e+05],
[ 512,    6.4976e+06,    2.7818e+05,    5.3517e+06,    6.6782e+06],
[1024,    3.7353e+07,    6.4473e+06,    2.4760e+07,    4.7515e+07],
[2048,    2.8886e+08,    2.5273e+07,    2.4149e+08,    3.3363e+08],