[Numpy-discussion] OpenBLAS on Mac

Sturla Molden sturla.molden at gmail.com
Sat Feb 22 20:43:00 EST 2014

On 23/02/14 00:11, Sturla Molden wrote:

> Did the upgrade to Mavericks do this?

Having tested different matrix sizes and averaged over 30 trials, I find 
the timings quite similar, actually. Accelerate is perhaps the winner, 
but it really depends on the matrix size.

See for yourself.



List of attachments:

Plots of the average runtime:

C codes:

Timings from my MacBook Pro (2.4 GHz i7)

-------------- next part --------------
A non-text attachment was scrubbed...
Name: dgemm_test.png
Type: image/png
Size: 59517 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20140223/2df62421/attachment.png>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: dgemm_test2.png
Type: image/png
Size: 49065 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20140223/2df62421/attachment-0001.png>
-------------- next part --------------
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "mkl.h"

/* Square matrix dimensions benchmarked in the first pass of main(). */
const int matrix_size[] = {
     10,    13,    16,    21,    26,    34,    43,    55,
     70,    89,   113,   144,   183,   234,   298,   379,
    483,   616,   785,  1000
};

/* Power-of-two matrix dimensions benchmarked in the second pass of main(). */
const int matrix_size_pow2[] = {
    4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048
};

/* Number of timed dgemm calls per matrix size. */
const int NREPEATS = 30;
/* Largest dimension used; the A, B and C buffers hold MAX_N*MAX_N doubles. */
const int MAX_N = 2048;

double nanodiff(const uint64_t _t0, const uint64_t _t1, 
                  const mach_timebase_info_data_t *tb_info)
    long double t0, t1, numer, denom, nanosec;
    numer = (long double)(tb_info->numer);
    denom = (long double)(tb_info->denom);    
    t0 = (long double)(_t0);
    t1 = (long double)(_t1);
    nanosec = (t1 - t0) * numer / denom;
    return (double)nanosec;

/*
 * Fill x[0..n-1] with pseudo-random doubles.
 *
 * Two multiply-with-carry style recurrences (multipliers 36969 and 18000)
 * are combined into one unsigned word, which is then scaled by
 * 2.3283064365386963e-10 (about 2^-32). The generator state is static, so
 * successive calls continue the same sequence and the output is identical
 * from run to run.
 *
 * NOTE(review): the [0, 1) output range assumes a 32-bit unsigned int.
 */
void fill_with_random(const int n, double *x)
{
    static unsigned int m_w = 123456;
    static unsigned int m_z = 5635273;
    int i;

    for (i = 0; i < n; i++) {
        m_z = 36969 * (m_z & 65535) + (m_z >> 16);
        m_w = 18000 * (m_w & 65535) + (m_w >> 16);
        x[i] = ((m_z << 16) + m_w) * 2.3283064365386963e-10;
    }
}

void statistics(const int n, const double *x, 
                   double *m, double *s, 
                   double *min, double *max)
    double sum_x=0.0, cx=0.0, sum_cxcx=0.0, _m;
    double minval, maxval, v;
    int i;
    for (i=0; i<n; i++) sum_x += x[i];
    _m = sum_x / (double)n;
    for (i=0; i<n; i++) {
        cx = x[i] - _m;
        sum_cxcx += cx*cx;
    *m = _m;
    *s = sqrt(sum_cxcx / (double)(n-1));
    minval = *x;
    maxval = *x;
    for (i=1; i<n; i++) {
        v = *x++;
        maxval = (maxval < v ? v : maxval);
        minval = (minval > v ? v : minval);
    *max = maxval;
    *min = minval;

int main(int argc, char **argv)

    double nanosec[NREPEATS];
    uint64_t t0, t1;
    mach_timebase_info_data_t tb_info;
    double *A = (double*)mkl_malloc(MAX_N*MAX_N*sizeof(double),64); 
    double *B = (double*)mkl_malloc(MAX_N*MAX_N*sizeof(double),64); 
    double *C = (double*)mkl_malloc(MAX_N*MAX_N*sizeof(double),64);
    double mean, std, min, max;
    int i, j, k, m, n;
    fill_with_random(MAX_N*MAX_N, A);
    fill_with_random(MAX_N*MAX_N, B);
    fill_with_random(MAX_N*MAX_N, C);
    for (i=0; i<20; i++) {
        n = matrix_size[i];
        m = n; k = n;
        for (j=0; j<NREPEATS; j++) {
            t0 = mach_absolute_time();
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);

            t1 = mach_absolute_time();
            nanosec[j] = nanodiff(t0, t1, &tb_info);
        statistics(NREPEATS, nanosec, &mean, &std, &min, &max);
        printf("[%4d,    %.4e,    %.4e,    %.4e,    %.4e],\n", 
                n, mean, std, min, max);
    for (i=0; i<10; i++) {
        n = matrix_size_pow2[i];
        m = n; k = n;
        for (j=0; j<NREPEATS; j++) {
            t0 = mach_absolute_time();
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);

            t1 = mach_absolute_time();
            nanosec[j] = nanodiff(t0, t1, &tb_info);
        statistics(NREPEATS, nanosec, &mean, &std, &min, &max);
        printf("[%4d,    %.4e,    %.4e,    %.4e,    %.4e],\n", 
                n, mean, std, min, max);        
    mkl_free(A); mkl_free(B); mkl_free(C);

-------------- next part --------------
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cblas.h>

/* Square matrix dimensions benchmarked in the first pass of main(). */
const int matrix_size[] = {
     10,    13,    16,    21,    26,    34,    43,    55,
     70,    89,   113,   144,   183,   234,   298,   379,
    483,   616,   785,  1000
};

/* Power-of-two matrix dimensions benchmarked in the second pass of main(). */
const int matrix_size_pow2[] = {
    4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048
};

/* Number of timed dgemm calls per matrix size. */
const int NREPEATS = 30;
/* Largest dimension used; the A, B and C buffers hold MAX_N*MAX_N doubles. */
const int MAX_N = 2048;

double nanodiff(const uint64_t _t0, const uint64_t _t1, 
                  const mach_timebase_info_data_t *tb_info)
    long double t0, t1, numer, denom, nanosec;
    numer = (long double)(tb_info->numer);
    denom = (long double)(tb_info->denom);    
    t0 = (long double)(_t0);
    t1 = (long double)(_t1);
    nanosec = (t1 - t0) * numer / denom;
    return (double)nanosec;

/*
 * Fill x[0..n-1] with pseudo-random doubles.
 *
 * Two multiply-with-carry style recurrences (multipliers 36969 and 18000)
 * are combined into one unsigned word, which is then scaled by
 * 2.3283064365386963e-10 (about 2^-32). The generator state is static, so
 * successive calls continue the same sequence and the output is identical
 * from run to run.
 *
 * NOTE(review): the [0, 1) output range assumes a 32-bit unsigned int.
 */
void fill_with_random(const int n, double *x)
{
    static unsigned int m_w = 123456;
    static unsigned int m_z = 5635273;
    int i;

    for (i = 0; i < n; i++) {
        m_z = 36969 * (m_z & 65535) + (m_z >> 16);
        m_w = 18000 * (m_w & 65535) + (m_w >> 16);
        x[i] = ((m_z << 16) + m_w) * 2.3283064365386963e-10;
    }
}

void statistics(const int n, const double *x, 
                   double *m, double *s, 
                   double *min, double *max)
    double sum_x=0.0, cx=0.0, sum_cxcx=0.0, _m;
    double minval, maxval, v;
    int i;
    for (i=0; i<n; i++) sum_x += x[i];
    _m = sum_x / (double)n;
    for (i=0; i<n; i++) {
        cx = x[i] - _m;
        sum_cxcx += cx*cx;
    *m = _m;
    *s = sqrt(sum_cxcx / (double)(n-1));
    minval = *x;
    maxval = *x;
    for (i=1; i<n; i++) {
        v = *x++;
        maxval = (maxval < v ? v : maxval);
        minval = (minval > v ? v : minval);
    *max = maxval;
    *min = minval;

int main(int argc, char **argv)

    double nanosec[NREPEATS];
    uint64_t t0, t1;
    mach_timebase_info_data_t tb_info;
    double *A = (double*)malloc(MAX_N*MAX_N*sizeof(double)); 
    double *B = (double*)malloc(MAX_N*MAX_N*sizeof(double)); 
    double *C = (double*)malloc(MAX_N*MAX_N*sizeof(double));
    double mean, std, min, max;
    int i, j, k, m, n;
    fill_with_random(MAX_N*MAX_N, A);
    fill_with_random(MAX_N*MAX_N, B);
    fill_with_random(MAX_N*MAX_N, C);
    for (i=0; i<20; i++) {
        n = matrix_size[i];
        m = n; k = n;
        for (j=0; j<NREPEATS; j++) {
            t0 = mach_absolute_time();
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);

            t1 = mach_absolute_time();
            nanosec[j] = nanodiff(t0, t1, &tb_info);
        statistics(NREPEATS, nanosec, &mean, &std, &min, &max);
        printf("[%4d,    %.4e,    %.4e,    %.4e,    %.4e],\n", 
                n, mean, std, min, max);        
    for (i=0; i<10; i++) {
        n = matrix_size_pow2[i];
        m = n; k = n;
        for (j=0; j<NREPEATS; j++) {
            t0 = mach_absolute_time();
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);

            t1 = mach_absolute_time();
            nanosec[j] = nanodiff(t0, t1, &tb_info);
        statistics(NREPEATS, nanosec, &mean, &std, &min, &max);
        printf("[%4d,    %.4e,    %.4e,    %.4e,    %.4e],\n", 
                n, mean, std, min, max);
    free(A); free(B); free(C);

-------------- next part --------------

/*
Copyright (c) 2014, Sturla Molden.
All rights reserved.

See license.txt.
*/


#include <assert.h>
/* #include <CoreServices/CoreServices.h> */
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <unistd.h>
#include <stdlib.h>

/* State of one start/stop interval timer. */
typedef struct {
    int status;      /* one of the PERF_TIMER_* state constants */
    uint64_t t0;     /* mach_absolute_time() reading stored by start_perf_timer */
    uint64_t t1;     /* mach_absolute_time() reading stored by stop_perf_timer */
    mach_timebase_info_data_t tb_info;  /* numer/denom ratio used to scale ticks to nanoseconds */
} perf_timer_t;

/* Values stored in perf_timer_t.status. */
const int PERF_TIMER_CLEAR = 0;    /* set by create_perf_timer and clear_perf_timer */
const int PERF_TIMER_RUNNING = 1;  /* set by start_perf_timer */
const int PERF_TIMER_STOPPED = 2;  /* set by stop_perf_timer; required by the nanodiff and clear calls */

perf_timer_t *create_perf_timer(void)
    perf_timer_t *timer = (perf_timer_t *)malloc(sizeof(perf_timer_t));
    if (timer == NULL) goto error;
    timer->status = PERF_TIMER_CLEAR;
    return timer;
    if (timer != NULL) free(timer);
    return NULL;

/* Release a timer obtained from create_perf_timer(); NULL is accepted. */
void destroy_perf_timer(perf_timer_t *timer)
{
    free(timer);  /* free(NULL) is a no-op, so no guard is needed */
}

/*
 * Record the start tick and mark the timer as running.
 *
 * Returns 0 on success, -1 if timer is NULL or already running.
 */
int start_perf_timer(perf_timer_t *timer)
{
    if (timer == NULL) return -1;
    if (timer->status == PERF_TIMER_RUNNING) return -1;
    timer->t0 = mach_absolute_time();
    timer->status = PERF_TIMER_RUNNING;
    return 0;
}

/*
 * Record the stop tick and mark the timer as stopped.
 *
 * Returns 0 on success, -1 if timer is NULL or not currently running.
 */
int stop_perf_timer(perf_timer_t *timer)
{
    if (timer == NULL) return -1;
    if (timer->status != PERF_TIMER_RUNNING) return -1;
    timer->t1 = mach_absolute_time();
    timer->status = PERF_TIMER_STOPPED;
    return 0;
}

/*
 * Return a stopped timer to the PERF_TIMER_CLEAR state.
 *
 * Returns 0 on success, -1 if timer is NULL or not in the stopped state.
 */
int clear_perf_timer(perf_timer_t *timer)
{
    if (timer == NULL) return -1;
    if (timer->status != PERF_TIMER_STOPPED) return -1;
    timer->status = PERF_TIMER_CLEAR;
    return 0;
}

int nanodiff_perf_timer(const perf_timer_t *timer, long double *nanosec)
    long double t0, t1, numer, denom;    
    if (timer == NULL) return -1;
    if (timer->status != PERF_TIMER_STOPPED) return -1;    
    numer = (long double)(timer->tb_info.numer);
    denom = (long double)(timer->tb_info.denom);    
    t0 = (long double)(timer->t0);
    t1 = (long double)(timer->t1);
    if (nanosec != NULL) {
        *nanosec = (t1 - t0) * numer / denom;
        return 0;
    } else return -1;

int nanores_perf_timer(const perf_timer_t *timer, long double *nanosec_resolution)
    long double numer, denom; 
    if (timer == NULL) return -1;
    numer = (long double)(timer->tb_info.numer);
    denom = (long double)(timer->tb_info.denom);
    if (nanosec_resolution != NULL) {
        *nanosec_resolution = numer / denom;
        return 0;
    } else return -1;

-------------- next part --------------
[  10,    1.8249e+03,    7.5052e+03,    4.3600e+02,    4.1562e+04],
[  13,    9.4907e+02,    2.7929e+02,    8.9100e+02,    2.4260e+03],
[  16,    2.3644e+03,    6.0826e+03,    1.0810e+03,    3.4278e+04],
[  21,    2.7637e+03,    2.5322e+03,    2.0580e+03,    1.3706e+04],
[  26,    3.6573e+03,    1.8597e+03,    2.9700e+03,    9.9230e+03],
[  34,    5.7156e+03,    8.1641e+02,    5.4040e+03,    8.9120e+03],
[  43,    1.0956e+04,    2.3088e+03,    1.0122e+04,    1.9766e+04],
[  55,    2.1788e+04,    1.1596e+04,    1.8955e+04,    8.2381e+04],
[  70,    3.8077e+04,    4.2441e+03,    3.6798e+04,    5.4417e+04],
[  89,    7.0032e+04,    5.1650e+03,    6.8568e+04,    9.6636e+04],
[ 113,    1.3922e+05,    1.3897e+04,    1.3513e+05,    2.0781e+05],
[ 144,    1.2979e+05,    7.2822e+04,    9.9868e+04,    4.4901e+05],
[ 183,    2.0268e+05,    1.0931e+04,    1.9760e+05,    2.4071e+05],
[ 234,    4.0132e+05,    9.1694e+04,    3.5535e+05,    7.1502e+05],
[ 298,    8.6309e+05,    2.6523e+05,    6.9578e+05,    1.4252e+06],
[ 379,    1.5985e+06,    4.0820e+05,    1.4171e+06,    2.8624e+06],
[ 483,    3.0864e+06,    6.0088e+05,    2.8326e+06,    5.4301e+06],
[ 616,    6.6489e+06,    1.5387e+06,    5.7692e+06,    1.0613e+07],
[ 785,    1.4754e+07,    3.4156e+06,    1.2107e+07,    2.1649e+07],
[1000,    3.1154e+07,    5.5740e+06,    2.4195e+07,    4.0360e+07],

[   4,    1.9477e+02,    3.2552e+02,    1.2800e+02,    1.9150e+03],
[   8,    3.8077e+02,    7.9966e+01,    3.5500e+02,    8.0200e+02],
[  16,    2.7588e+03,    6.3393e+03,    1.4950e+03,    3.6304e+04],
[  32,    6.6004e+03,    2.1850e+03,    5.5530e+03,    1.5607e+04],
[  64,    3.5622e+04,    1.2820e+03,    3.4828e+04,    4.1095e+04],
[ 128,    9.6848e+04,    3.6344e+04,    7.9238e+04,    2.3208e+05],
[ 256,    4.5615e+05,    9.2865e+04,    4.2092e+05,    8.2640e+05],
[ 512,    3.3334e+06,    2.6552e+05,    3.2502e+06,    4.7093e+06],
[1024,    3.5191e+07,    5.5548e+06,    2.5881e+07,    4.5181e+07],
[2048,    2.7725e+08,    1.5691e+07,    2.3737e+08,    2.9737e+08],
-------------- next part --------------
[  10,    2.4680e+03,    1.0001e+04,    5.2700e+02,    5.5399e+04],
[  13,    8.8493e+02,    3.0951e+02,    8.0500e+02,    2.5100e+03],
[  16,    3.3365e+03,    6.1387e+03,    2.0660e+03,    3.5831e+04],
[  21,    3.5532e+03,    6.4100e+02,    3.1980e+03,    6.2850e+03],
[  26,    4.5164e+03,    1.6061e+02,    4.3510e+03,    5.0860e+03],
[  34,    5.4646e+03,    8.0315e+02,    5.0180e+03,    9.6500e+03],
[  43,    9.5117e+03,    7.1500e+02,    9.1560e+03,    1.3219e+04],
[  55,    1.5673e+04,    9.1927e+02,    1.5228e+04,    2.0464e+04],
[  70,    2.3438e+04,    2.3378e+03,    2.2688e+04,    3.5773e+04],
[  89,    4.8458e+04,    9.8972e+02,    4.7769e+04,    5.3483e+04],
[ 113,    6.3075e+04,    1.8880e+04,    4.7137e+04,    1.0836e+05],
[ 144,    9.1631e+04,    1.5589e+04,    8.3371e+04,    1.3086e+05],
[ 183,    1.7476e+05,    6.5323e+03,    1.7318e+05,    2.0928e+05],
[ 234,    4.6118e+05,    1.5100e+05,    3.2846e+05,    6.8653e+05],
[ 298,    1.3137e+06,    2.2717e+04,    1.2116e+06,    1.3345e+06],
[ 379,    2.6917e+06,    1.7294e+05,    1.8249e+06,    2.8141e+06],
[ 483,    4.5088e+06,    1.1748e+06,    2.5604e+06,    5.5026e+06],
[ 616,    9.0293e+06,    2.1052e+06,    5.3924e+06,    1.1081e+07],
[ 785,    2.2010e+07,    1.7049e+06,    1.7738e+07,    2.6944e+07],
[1000,    3.4778e+07,    8.2156e+06,    2.2372e+07,    4.5916e+07],

[   4,    4.3840e+02,    4.3370e+02,    2.5200e+02,    2.6970e+03],
[   8,    4.4577e+02,    1.7493e+02,    3.8900e+02,    1.3540e+03],
[  16,    1.9848e+03,    4.6771e+02,    1.6710e+03,    4.4150e+03],
[  32,    3.0532e+03,    5.4374e+02,    2.7920e+03,    5.2560e+03],
[  64,    9.3250e+03,    6.6518e+02,    8.9210e+03,    1.2746e+04],
[ 128,    5.4620e+04,    2.0069e+03,    5.3877e+04,    6.5200e+04],
[ 256,    3.7464e+05,    7.6448e+03,    3.7055e+05,    4.0864e+05],
[ 512,    3.7553e+06,    1.3328e+06,    2.9490e+06,    6.1633e+06],
[1024,    2.9947e+07,    5.5860e+06,    2.3243e+07,    4.3105e+07],
[2048,    2.4223e+08,    2.3419e+07,    1.9434e+08,    2.8706e+08],
-------------- next part --------------
[  10,    1.3607e+03,    5.1284e+03,    3.4300e+02,    2.8490e+04],
[  13,    6.4280e+02,    1.9452e+02,    5.9600e+02,    1.6710e+03],
[  16,    7.2043e+02,    6.6691e+01,    6.9700e+02,    1.0690e+03],
[  21,    1.5310e+04,    7.3601e+04,    1.8240e+03,    4.0500e+05],
[  26,    3.1701e+03,    1.6513e+03,    2.7050e+03,    1.0508e+04],
[  34,    1.2321e+04,    3.9041e+04,    4.9730e+03,    2.1902e+05],
[  43,    9.0754e+03,    1.8779e+03,    8.5860e+03,    1.8982e+04],
[  55,    1.4239e+04,    3.5869e+03,    1.3076e+04,    3.0638e+04],
[  70,    2.2525e+04,    5.4555e+03,    2.1133e+04,    5.1306e+04],
[  89,    4.4427e+04,    6.8388e+03,    4.2697e+04,    8.0575e+04],
[ 113,    8.3632e+04,    1.0850e+04,    8.0893e+04,    1.4103e+05],
[ 144,    1.5535e+05,    1.6323e+04,    1.5149e+05,    2.4163e+05],
[ 183,    3.1204e+05,    2.8211e+04,    2.5645e+05,    4.4332e+05],
[ 234,    6.2586e+05,    3.5000e+04,    6.1664e+05,    8.1070e+05],
[ 298,    1.1201e+06,    1.3749e+05,    9.2511e+05,    1.3264e+06],
[ 379,    2.5531e+06,    6.4311e+05,    1.6633e+06,    3.3693e+06],
[ 483,    5.3612e+06,    7.7730e+05,    2.9665e+06,    5.8146e+06],
[ 616,    8.0512e+06,    2.3215e+06,    5.5969e+06,    1.2458e+07],
[ 785,    1.8979e+07,    4.2977e+06,    1.1792e+07,    2.3341e+07],
[1000,    3.6060e+07,    5.8910e+06,    2.2430e+07,    4.7225e+07],

[   4,    3.7207e+02,    6.6756e+02,    1.9800e+02,    3.9020e+03],
[   8,    3.7760e+02,    1.2692e+02,    3.2600e+02,    1.0430e+03],
[  16,    1.2662e+03,    1.9406e+02,    1.1360e+03,    2.2460e+03],
[  32,    6.7047e+03,    2.3775e+03,    5.6690e+03,    1.9156e+04],
[  64,    2.1552e+04,    1.6013e+04,    1.4132e+04,    6.6789e+04],
[ 128,    1.0602e+05,    4.5159e+03,    1.0420e+05,    1.2973e+05],
[ 256,    7.7634e+05,    8.7983e+04,    6.3991e+05,    9.0461e+05],
[ 512,    6.4976e+06,    2.7818e+05,    5.3517e+06,    6.6782e+06],
[1024,    3.7353e+07,    6.4473e+06,    2.4760e+07,    4.7515e+07],
[2048,    2.8886e+08,    2.5273e+07,    2.4149e+08,    3.3363e+08],

More information about the NumPy-Discussion mailing list