[Numpy-discussion] OpenBLAS on Mac
Sturla Molden
sturla.molden at gmail.com
Sat Feb 22 20:43:00 EST 2014
On 23/02/14 00:11, Sturla Molden wrote:
> Did the upgrade to Mavericks do this?
>
After testing different matrix sizes and averaging over 30 trials, the
three libraries turn out to be quite similar, actually. Accelerate is
perhaps the winner, but it really depends on the matrix size.
See for yourself.
:-)
Sturla
List of attachments:
Plots of the average runtime:
dgemm_test.png
dgemm_test2.png
C codes:
perftest_openblas.c
perftest_accelerate.c
perftest_mkl.c
Timings from my MacBook Pro (2.4 GHz i7)
accelerate.txt
openblas.txt
mkl.txt
-------------- next part --------------
A non-text attachment was scrubbed...
Name: dgemm_test.png
Type: image/png
Size: 59517 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20140223/2df62421/attachment.png>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: dgemm_test2.png
Type: image/png
Size: 49065 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20140223/2df62421/attachment-0001.png>
-------------- next part --------------
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "mkl.h"
/* Edge lengths n of the square n-by-n matrices timed in the first
   benchmark pass: 20 sizes, roughly geometrically spaced from 10 to 1000. */
const int matrix_size[] = {
10, 13, 16, 21, 26, 34, 43, 55,
70, 89, 113, 144, 183, 234, 298, 379,
483, 616, 785, 1000
};
/* Edge lengths for the second benchmark pass: the 10 powers of two
   from 4 to 2048. */
const int matrix_size_pow2[] = {
4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048
};
/* Number of timed dgemm calls per matrix size; also the length of the
   per-size sample array handed to statistics(). */
const int NREPEATS = 30;
/* Largest matrix edge length; main() allocates MAX_N*MAX_N doubles for
   each of A, B and C. */
const int MAX_N = 2048;
/*
 * Convert the interval between two tick readings into nanoseconds.
 *
 *   _t0, _t1  start and end tick counts (the callers obtain them from
 *             mach_absolute_time())
 *   tb_info   timebase whose numer/denom ratio scales ticks to nanoseconds
 *
 * The arithmetic is carried out in long double and only the final
 * result is narrowed to double.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1,
                const mach_timebase_info_data_t *tb_info)
{
    const long double ticks = (long double)_t1 - (long double)_t0;
    const long double scale_num = (long double)tb_info->numer;
    const long double scale_den = (long double)tb_info->denom;

    return (double)(ticks * scale_num / scale_den);
}
/*
 * Fill x[0..n-1] with pseudo-random doubles in [0, 1).
 *
 * Two 16-bit multiply-with-carry style recurrences are advanced once
 * per element and combined into one unsigned word, which is then scaled
 * by 2.3283064365386963e-10 (2^-32).  The generator state is static, so
 * successive calls continue the same sequence rather than restarting it.
 * Nothing is written when n <= 0.
 */
void fill_with_random(const int n, double *x)
{
    static unsigned int m_w = 123456;
    static unsigned int m_z = 5635273;
    int k;

    for (k = 0; k < n; ++k) {
        unsigned int word;

        m_z = 36969 * (m_z & 0xFFFFu) + (m_z >> 16);
        m_w = 18000 * (m_w & 0xFFFFu) + (m_w >> 16);
        word = (m_z << 16) + m_w;
        x[k] = word * 2.3283064365386963e-10;
    }
}
void statistics(const int n, const double *x,
double *m, double *s,
double *min, double *max)
{
double sum_x=0.0, cx=0.0, sum_cxcx=0.0, _m;
double minval, maxval, v;
int i;
for (i=0; i<n; i++) sum_x += x[i];
_m = sum_x / (double)n;
for (i=0; i<n; i++) {
cx = x[i] - _m;
sum_cxcx += cx*cx;
}
*m = _m;
*s = sqrt(sum_cxcx / (double)(n-1));
minval = *x;
maxval = *x;
for (i=1; i<n; i++) {
v = *x++;
maxval = (maxval < v ? v : maxval);
minval = (minval > v ? v : minval);
}
*max = maxval;
*min = minval;
}
/*
 * Time square double-precision matrix products for every edge length in
 * sizes[0..count-1] and print one result row per size.
 *
 * For each size n, cblas_dgemm is called NREPEATS times on the leading
 * n-by-n part of A, B and C (row-major, no transposes, alpha = 1.0,
 * beta = 1.0, so C accumulates across calls).  Each call is bracketed by
 * mach_absolute_time() readings, and the mean, standard deviation,
 * minimum and maximum of the per-call nanosecond timings are printed as
 * "[n, mean, std, min, max],".
 */
static void time_dgemm_sizes(const int *sizes, const int count,
                             const double *A, const double *B, double *C,
                             const mach_timebase_info_data_t *tb_info)
{
    double nanosec[NREPEATS];
    double mean, std, min, max;
    uint64_t t0, t1;
    int i, j, k, m, n;

    for (i = 0; i < count; i++) {
        n = sizes[i];
        m = n; k = n;
        for (j = 0; j < NREPEATS; j++) {
            t0 = mach_absolute_time();
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        m, n, k, 1.0, A, k, B, n, 1.0, C, n);
            t1 = mach_absolute_time();
            nanosec[j] = nanodiff(t0, t1, tb_info);
        }
        statistics(NREPEATS, nanosec, &mean, &std, &min, &max);
        printf("[%4d, %.4e, %.4e, %.4e, %.4e],\n",
               n, mean, std, min, max);
    }
}

/*
 * MKL dgemm benchmark driver.
 *
 * Allocates three 64-byte-aligned MAX_N-by-MAX_N matrices with
 * mkl_malloc, fills them with pseudo-random values, and prints timing
 * rows for the sizes in matrix_size followed, after a blank gap, by the
 * sizes in matrix_size_pow2.
 *
 * Returns 0 on success and EXIT_FAILURE if any matrix cannot be
 * allocated.
 */
int main(int argc, char **argv)
{
    const size_t nbytes = (size_t)MAX_N * (size_t)MAX_N * sizeof(double);
    /* Derive the loop bounds from the tables so they cannot drift apart. */
    const int n_sizes = (int)(sizeof matrix_size / sizeof matrix_size[0]);
    const int n_pow2 = (int)(sizeof matrix_size_pow2 / sizeof matrix_size_pow2[0]);
    mach_timebase_info_data_t tb_info;
    double *A = (double*)mkl_malloc(nbytes, 64);
    double *B = (double*)mkl_malloc(nbytes, 64);
    double *C = (double*)mkl_malloc(nbytes, 64);

    (void)argc;
    (void)argv;

    /* fill_with_random would dereference a NULL buffer, so bail out first. */
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "failed to allocate %d x %d matrices\n", MAX_N, MAX_N);
        if (A != NULL) mkl_free(A);
        if (B != NULL) mkl_free(B);
        if (C != NULL) mkl_free(C);
        return EXIT_FAILURE;
    }

    mach_timebase_info(&tb_info);
    fill_with_random(MAX_N*MAX_N, A);
    fill_with_random(MAX_N*MAX_N, B);
    fill_with_random(MAX_N*MAX_N, C);

    time_dgemm_sizes(matrix_size, n_sizes, A, B, C, &tb_info);
    printf("\n\n");
    time_dgemm_sizes(matrix_size_pow2, n_pow2, A, B, C, &tb_info);

    mkl_free(A); mkl_free(B); mkl_free(C);
    return 0;
}
-------------- next part --------------
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cblas.h>
/* Edge lengths n of the square n-by-n matrices timed in the first
   benchmark pass: 20 sizes, roughly geometrically spaced from 10 to 1000. */
const int matrix_size[] = {
10, 13, 16, 21, 26, 34, 43, 55,
70, 89, 113, 144, 183, 234, 298, 379,
483, 616, 785, 1000
};
/* Edge lengths for the second benchmark pass: the 10 powers of two
   from 4 to 2048. */
const int matrix_size_pow2[] = {
4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048
};
/* Number of timed dgemm calls per matrix size; also the length of the
   per-size sample array handed to statistics(). */
const int NREPEATS = 30;
/* Largest matrix edge length; main() allocates MAX_N*MAX_N doubles for
   each of A, B and C. */
const int MAX_N = 2048;
/*
 * Return the elapsed time, in nanoseconds, between tick counts _t0 and
 * _t1.
 *
 * The tick difference is multiplied by tb_info->numer and divided by
 * tb_info->denom.  All intermediate values are long double; the result
 * is converted to double on return.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1,
                const mach_timebase_info_data_t *tb_info)
{
    long double elapsed_ticks;
    long double elapsed_ns;

    elapsed_ticks = (long double)_t1 - (long double)_t0;
    elapsed_ns = elapsed_ticks * (long double)tb_info->numer
                 / (long double)tb_info->denom;
    return (double)elapsed_ns;
}
/*
 * Write n pseudo-random doubles in [0, 1) to x.
 *
 * Each element advances two static 16-bit multiply-with-carry style
 * recurrences, merges them into one unsigned word and scales it by
 * 2.3283064365386963e-10 (2^-32).  Because the state is static, the
 * sequence carries on across calls.  A non-positive n writes nothing.
 */
void fill_with_random(const int n, double *x)
{
    static unsigned int m_w = 123456;
    static unsigned int m_z = 5635273;
    int remaining = n;

    while (remaining > 0) {
        m_z = 36969 * (m_z & 65535) + (m_z >> 16);
        m_w = 18000 * (m_w & 65535) + (m_w >> 16);
        *x = ((m_z << 16) + m_w) * 2.3283064365386963e-10;
        ++x;
        --remaining;
    }
}
void statistics(const int n, const double *x,
double *m, double *s,
double *min, double *max)
{
double sum_x=0.0, cx=0.0, sum_cxcx=0.0, _m;
double minval, maxval, v;
int i;
for (i=0; i<n; i++) sum_x += x[i];
_m = sum_x / (double)n;
for (i=0; i<n; i++) {
cx = x[i] - _m;
sum_cxcx += cx*cx;
}
*m = _m;
*s = sqrt(sum_cxcx / (double)(n-1));
minval = *x;
maxval = *x;
for (i=1; i<n; i++) {
v = *x++;
maxval = (maxval < v ? v : maxval);
minval = (minval > v ? v : minval);
}
*max = maxval;
*min = minval;
}
/*
 * Time square double-precision matrix products for every edge length in
 * sizes[0..count-1] and print one result row per size.
 *
 * For each size n, cblas_dgemm is called NREPEATS times on the leading
 * n-by-n part of A, B and C (row-major, no transposes, alpha = 1.0,
 * beta = 1.0, so C accumulates across calls).  Each call is bracketed by
 * mach_absolute_time() readings, and the mean, standard deviation,
 * minimum and maximum of the per-call nanosecond timings are printed as
 * "[n, mean, std, min, max],".
 */
static void time_dgemm_sizes(const int *sizes, const int count,
                             const double *A, const double *B, double *C,
                             const mach_timebase_info_data_t *tb_info)
{
    double nanosec[NREPEATS];
    double mean, std, min, max;
    uint64_t t0, t1;
    int i, j, k, m, n;

    for (i = 0; i < count; i++) {
        n = sizes[i];
        m = n; k = n;
        for (j = 0; j < NREPEATS; j++) {
            t0 = mach_absolute_time();
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        m, n, k, 1.0, A, k, B, n, 1.0, C, n);
            t1 = mach_absolute_time();
            nanosec[j] = nanodiff(t0, t1, tb_info);
        }
        statistics(NREPEATS, nanosec, &mean, &std, &min, &max);
        printf("[%4d, %.4e, %.4e, %.4e, %.4e],\n",
               n, mean, std, min, max);
    }
}

/*
 * CBLAS dgemm benchmark driver.
 *
 * Allocates three MAX_N-by-MAX_N matrices with malloc, fills them with
 * pseudo-random values, and prints timing rows for the sizes in
 * matrix_size followed, after a blank gap, by the sizes in
 * matrix_size_pow2.
 *
 * Returns 0 on success and EXIT_FAILURE if any matrix cannot be
 * allocated.
 */
int main(int argc, char **argv)
{
    const size_t nbytes = (size_t)MAX_N * (size_t)MAX_N * sizeof(double);
    /* Derive the loop bounds from the tables so they cannot drift apart. */
    const int n_sizes = (int)(sizeof matrix_size / sizeof matrix_size[0]);
    const int n_pow2 = (int)(sizeof matrix_size_pow2 / sizeof matrix_size_pow2[0]);
    mach_timebase_info_data_t tb_info;
    double *A = (double*)malloc(nbytes);
    double *B = (double*)malloc(nbytes);
    double *C = (double*)malloc(nbytes);

    (void)argc;
    (void)argv;

    /* fill_with_random would dereference a NULL buffer, so bail out first. */
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "failed to allocate %d x %d matrices\n", MAX_N, MAX_N);
        free(A); free(B); free(C);
        return EXIT_FAILURE;
    }

    mach_timebase_info(&tb_info);
    fill_with_random(MAX_N*MAX_N, A);
    fill_with_random(MAX_N*MAX_N, B);
    fill_with_random(MAX_N*MAX_N, C);

    time_dgemm_sizes(matrix_size, n_sizes, A, B, C, &tb_info);
    printf("\n\n");
    time_dgemm_sizes(matrix_size_pow2, n_pow2, A, B, C, &tb_info);

    free(A); free(B); free(C);
    return 0;
}
-------------- next part --------------
/*
Copyright (c) 2014, Sturla Molden.
All rights reserved.
See license.txt.
*/
#include <assert.h>
/* #include <CoreServices/CoreServices.h> */
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <unistd.h>
#include <stdlib.h>
/* State of one start/stop interval timer built on mach_absolute_time(). */
typedef struct {
/* Current state: one of the PERF_TIMER_* constants below. */
int status;
/* Tick count recorded by start_perf_timer(). */
uint64_t t0;
/* Tick count recorded by stop_perf_timer(). */
uint64_t t1;
/* Timebase filled in by mach_timebase_info() in create_perf_timer();
   numer/denom scales tick counts to nanoseconds. */
mach_timebase_info_data_t tb_info;
} perf_timer_t;
/* Timer states.  A timer is CLEAR after creation or clear_perf_timer(),
   RUNNING after start_perf_timer(), and STOPPED after stop_perf_timer(). */
const int PERF_TIMER_CLEAR = 0;
const int PERF_TIMER_RUNNING = 1;
const int PERF_TIMER_STOPPED = 2;
perf_timer_t *create_perf_timer(void)
{
perf_timer_t *timer = (perf_timer_t *)malloc(sizeof(perf_timer_t));
if (timer == NULL) goto error;
timer->status = PERF_TIMER_CLEAR;
mach_timebase_info(&timer->tb_info);
return timer;
error:
if (timer != NULL) free(timer);
return NULL;
}
/*
 * Release a timer obtained from create_perf_timer().
 * Passing NULL is allowed and does nothing, since free(NULL) is a no-op.
 */
void destroy_perf_timer(perf_timer_t *timer)
{
    free(timer);
}
/*
 * Record the start tick count and mark the timer as running.
 *
 * Returns 0 on success, or -1 if timer is NULL or already running.
 */
int start_perf_timer(perf_timer_t *timer)
{
    if (timer == NULL || timer->status == PERF_TIMER_RUNNING) {
        return -1;
    }
    timer->t0 = mach_absolute_time();
    timer->status = PERF_TIMER_RUNNING;
    return 0;
}
/*
 * Record the stop tick count and mark the timer as stopped.
 *
 * Returns 0 on success, or -1 if timer is NULL or not currently running.
 */
int stop_perf_timer(perf_timer_t *timer)
{
    if (timer == NULL || timer->status != PERF_TIMER_RUNNING) {
        return -1;
    }
    timer->t1 = mach_absolute_time();
    timer->status = PERF_TIMER_STOPPED;
    return 0;
}
/*
 * Return a stopped timer to the PERF_TIMER_CLEAR state.
 *
 * Returns 0 on success, or -1 if timer is NULL or not in the stopped
 * state.  The recorded tick counts are left as they are.
 */
int clear_perf_timer(perf_timer_t *timer)
{
    if (timer != NULL && timer->status == PERF_TIMER_STOPPED) {
        timer->status = PERF_TIMER_CLEAR;
        return 0;
    }
    return -1;
}
/*
 * Report the interval measured by a stopped timer, in nanoseconds.
 *
 * On success, stores (t1 - t0) * numer / denom in *nanosec and returns
 * 0.  Returns -1, leaving *nanosec untouched, if timer is NULL, the
 * timer is not in the stopped state, or nanosec is NULL.
 */
int nanodiff_perf_timer(const perf_timer_t *timer, long double *nanosec)
{
    long double ticks, scale_num, scale_den;

    if (timer == NULL || timer->status != PERF_TIMER_STOPPED) {
        return -1;
    }
    if (nanosec == NULL) {
        return -1;
    }
    ticks = (long double)timer->t1 - (long double)timer->t0;
    scale_num = (long double)timer->tb_info.numer;
    scale_den = (long double)timer->tb_info.denom;
    *nanosec = ticks * scale_num / scale_den;
    return 0;
}
/*
 * Report the length of one timer tick in nanoseconds.
 *
 * On success, stores numer / denom from the timer's timebase in
 * *nanosec_resolution and returns 0.  Returns -1 if either pointer is
 * NULL.  The timer may be in any state.
 */
int nanores_perf_timer(const perf_timer_t *timer, long double *nanosec_resolution)
{
    if (timer == NULL || nanosec_resolution == NULL) {
        return -1;
    }
    *nanosec_resolution = (long double)timer->tb_info.numer
                          / (long double)timer->tb_info.denom;
    return 0;
}
-------------- next part --------------
[ 10, 1.8249e+03, 7.5052e+03, 4.3600e+02, 4.1562e+04],
[ 13, 9.4907e+02, 2.7929e+02, 8.9100e+02, 2.4260e+03],
[ 16, 2.3644e+03, 6.0826e+03, 1.0810e+03, 3.4278e+04],
[ 21, 2.7637e+03, 2.5322e+03, 2.0580e+03, 1.3706e+04],
[ 26, 3.6573e+03, 1.8597e+03, 2.9700e+03, 9.9230e+03],
[ 34, 5.7156e+03, 8.1641e+02, 5.4040e+03, 8.9120e+03],
[ 43, 1.0956e+04, 2.3088e+03, 1.0122e+04, 1.9766e+04],
[ 55, 2.1788e+04, 1.1596e+04, 1.8955e+04, 8.2381e+04],
[ 70, 3.8077e+04, 4.2441e+03, 3.6798e+04, 5.4417e+04],
[ 89, 7.0032e+04, 5.1650e+03, 6.8568e+04, 9.6636e+04],
[ 113, 1.3922e+05, 1.3897e+04, 1.3513e+05, 2.0781e+05],
[ 144, 1.2979e+05, 7.2822e+04, 9.9868e+04, 4.4901e+05],
[ 183, 2.0268e+05, 1.0931e+04, 1.9760e+05, 2.4071e+05],
[ 234, 4.0132e+05, 9.1694e+04, 3.5535e+05, 7.1502e+05],
[ 298, 8.6309e+05, 2.6523e+05, 6.9578e+05, 1.4252e+06],
[ 379, 1.5985e+06, 4.0820e+05, 1.4171e+06, 2.8624e+06],
[ 483, 3.0864e+06, 6.0088e+05, 2.8326e+06, 5.4301e+06],
[ 616, 6.6489e+06, 1.5387e+06, 5.7692e+06, 1.0613e+07],
[ 785, 1.4754e+07, 3.4156e+06, 1.2107e+07, 2.1649e+07],
[1000, 3.1154e+07, 5.5740e+06, 2.4195e+07, 4.0360e+07],
[ 4, 1.9477e+02, 3.2552e+02, 1.2800e+02, 1.9150e+03],
[ 8, 3.8077e+02, 7.9966e+01, 3.5500e+02, 8.0200e+02],
[ 16, 2.7588e+03, 6.3393e+03, 1.4950e+03, 3.6304e+04],
[ 32, 6.6004e+03, 2.1850e+03, 5.5530e+03, 1.5607e+04],
[ 64, 3.5622e+04, 1.2820e+03, 3.4828e+04, 4.1095e+04],
[ 128, 9.6848e+04, 3.6344e+04, 7.9238e+04, 2.3208e+05],
[ 256, 4.5615e+05, 9.2865e+04, 4.2092e+05, 8.2640e+05],
[ 512, 3.3334e+06, 2.6552e+05, 3.2502e+06, 4.7093e+06],
[1024, 3.5191e+07, 5.5548e+06, 2.5881e+07, 4.5181e+07],
[2048, 2.7725e+08, 1.5691e+07, 2.3737e+08, 2.9737e+08],
-------------- next part --------------
[ 10, 2.4680e+03, 1.0001e+04, 5.2700e+02, 5.5399e+04],
[ 13, 8.8493e+02, 3.0951e+02, 8.0500e+02, 2.5100e+03],
[ 16, 3.3365e+03, 6.1387e+03, 2.0660e+03, 3.5831e+04],
[ 21, 3.5532e+03, 6.4100e+02, 3.1980e+03, 6.2850e+03],
[ 26, 4.5164e+03, 1.6061e+02, 4.3510e+03, 5.0860e+03],
[ 34, 5.4646e+03, 8.0315e+02, 5.0180e+03, 9.6500e+03],
[ 43, 9.5117e+03, 7.1500e+02, 9.1560e+03, 1.3219e+04],
[ 55, 1.5673e+04, 9.1927e+02, 1.5228e+04, 2.0464e+04],
[ 70, 2.3438e+04, 2.3378e+03, 2.2688e+04, 3.5773e+04],
[ 89, 4.8458e+04, 9.8972e+02, 4.7769e+04, 5.3483e+04],
[ 113, 6.3075e+04, 1.8880e+04, 4.7137e+04, 1.0836e+05],
[ 144, 9.1631e+04, 1.5589e+04, 8.3371e+04, 1.3086e+05],
[ 183, 1.7476e+05, 6.5323e+03, 1.7318e+05, 2.0928e+05],
[ 234, 4.6118e+05, 1.5100e+05, 3.2846e+05, 6.8653e+05],
[ 298, 1.3137e+06, 2.2717e+04, 1.2116e+06, 1.3345e+06],
[ 379, 2.6917e+06, 1.7294e+05, 1.8249e+06, 2.8141e+06],
[ 483, 4.5088e+06, 1.1748e+06, 2.5604e+06, 5.5026e+06],
[ 616, 9.0293e+06, 2.1052e+06, 5.3924e+06, 1.1081e+07],
[ 785, 2.2010e+07, 1.7049e+06, 1.7738e+07, 2.6944e+07],
[1000, 3.4778e+07, 8.2156e+06, 2.2372e+07, 4.5916e+07],
[ 4, 4.3840e+02, 4.3370e+02, 2.5200e+02, 2.6970e+03],
[ 8, 4.4577e+02, 1.7493e+02, 3.8900e+02, 1.3540e+03],
[ 16, 1.9848e+03, 4.6771e+02, 1.6710e+03, 4.4150e+03],
[ 32, 3.0532e+03, 5.4374e+02, 2.7920e+03, 5.2560e+03],
[ 64, 9.3250e+03, 6.6518e+02, 8.9210e+03, 1.2746e+04],
[ 128, 5.4620e+04, 2.0069e+03, 5.3877e+04, 6.5200e+04],
[ 256, 3.7464e+05, 7.6448e+03, 3.7055e+05, 4.0864e+05],
[ 512, 3.7553e+06, 1.3328e+06, 2.9490e+06, 6.1633e+06],
[1024, 2.9947e+07, 5.5860e+06, 2.3243e+07, 4.3105e+07],
[2048, 2.4223e+08, 2.3419e+07, 1.9434e+08, 2.8706e+08],
-------------- next part --------------
[ 10, 1.3607e+03, 5.1284e+03, 3.4300e+02, 2.8490e+04],
[ 13, 6.4280e+02, 1.9452e+02, 5.9600e+02, 1.6710e+03],
[ 16, 7.2043e+02, 6.6691e+01, 6.9700e+02, 1.0690e+03],
[ 21, 1.5310e+04, 7.3601e+04, 1.8240e+03, 4.0500e+05],
[ 26, 3.1701e+03, 1.6513e+03, 2.7050e+03, 1.0508e+04],
[ 34, 1.2321e+04, 3.9041e+04, 4.9730e+03, 2.1902e+05],
[ 43, 9.0754e+03, 1.8779e+03, 8.5860e+03, 1.8982e+04],
[ 55, 1.4239e+04, 3.5869e+03, 1.3076e+04, 3.0638e+04],
[ 70, 2.2525e+04, 5.4555e+03, 2.1133e+04, 5.1306e+04],
[ 89, 4.4427e+04, 6.8388e+03, 4.2697e+04, 8.0575e+04],
[ 113, 8.3632e+04, 1.0850e+04, 8.0893e+04, 1.4103e+05],
[ 144, 1.5535e+05, 1.6323e+04, 1.5149e+05, 2.4163e+05],
[ 183, 3.1204e+05, 2.8211e+04, 2.5645e+05, 4.4332e+05],
[ 234, 6.2586e+05, 3.5000e+04, 6.1664e+05, 8.1070e+05],
[ 298, 1.1201e+06, 1.3749e+05, 9.2511e+05, 1.3264e+06],
[ 379, 2.5531e+06, 6.4311e+05, 1.6633e+06, 3.3693e+06],
[ 483, 5.3612e+06, 7.7730e+05, 2.9665e+06, 5.8146e+06],
[ 616, 8.0512e+06, 2.3215e+06, 5.5969e+06, 1.2458e+07],
[ 785, 1.8979e+07, 4.2977e+06, 1.1792e+07, 2.3341e+07],
[1000, 3.6060e+07, 5.8910e+06, 2.2430e+07, 4.7225e+07],
[ 4, 3.7207e+02, 6.6756e+02, 1.9800e+02, 3.9020e+03],
[ 8, 3.7760e+02, 1.2692e+02, 3.2600e+02, 1.0430e+03],
[ 16, 1.2662e+03, 1.9406e+02, 1.1360e+03, 2.2460e+03],
[ 32, 6.7047e+03, 2.3775e+03, 5.6690e+03, 1.9156e+04],
[ 64, 2.1552e+04, 1.6013e+04, 1.4132e+04, 6.6789e+04],
[ 128, 1.0602e+05, 4.5159e+03, 1.0420e+05, 1.2973e+05],
[ 256, 7.7634e+05, 8.7983e+04, 6.3991e+05, 9.0461e+05],
[ 512, 6.4976e+06, 2.7818e+05, 5.3517e+06, 6.6782e+06],
[1024, 3.7353e+07, 6.4473e+06, 2.4760e+07, 4.7515e+07],
[2048, 2.8886e+08, 2.5273e+07, 2.4149e+08, 3.3363e+08],
More information about the NumPy-Discussion
mailing list