[Numpy-discussion] OpenBLAS on Mac
Sturla Molden
sturla.molden at gmail.com
Sat Feb 22 17:39:11 EST 2014
On 22/02/14 22:15, Nathaniel Smith wrote:
>> $ make TARGET=SANDYBRIDGE USE_OPENMP=0 BINARY=64 NOFORTRAN=1
>
> You'll definitely want to disable the affinity support too, and
> probably memory warmup. And possibly increase the maximum thread
> count, unless you'll only use the library on the computer it was built
> on. And maybe other things. The OpenBLAS build process has so many
> ways to accidentally impale yourself, it's an object lesson in why
> building regulations are a good thing.
Thanks for the advice.
Right now I am just testing on my own computer.
cblas_dgemm is running roughly 50% faster with OpenBLAS than with MKL 11.1
update 2; sometimes OpenBLAS is twice as fast as MKL.
WTF???
:-D
Ok, next runner up is Accelerate. Let's see how it compares to OpenBLAS
and MKL on Mavericks.
Sturla
-------------- next part --------------
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#include "mkl.h"
/*
 * Convert the interval between two tick counts into nanoseconds.
 *
 * _t0 is the start reading and _t1 the end reading; the caller in this
 * program obtains both from mach_absolute_time(). The tick difference is
 * scaled by the numer/denom ratio reported by mach_timebase_info().
 *
 * The arithmetic is carried out in long double and narrowed to double only
 * for the return value. The result is negative when _t1 precedes _t0.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1)
{
    mach_timebase_info_data_t timebase;
    mach_timebase_info(&timebase);

    const long double start = (long double)_t0;
    const long double stop = (long double)_t1;
    const long double scale_num = (long double)timebase.numer;
    const long double scale_den = (long double)timebase.denom;

    /* ticks * numer / denom, evaluated left to right */
    const long double elapsed_ns = (stop - start) * scale_num / scale_den;
    return (double)elapsed_ns;
}
/*
 * Time a single n x n double-precision matrix product, C = A*B + C, computed
 * by cblas_dgemm on buffers obtained from mkl_malloc, and print the elapsed
 * time in nanoseconds.
 *
 * Returns EXIT_SUCCESS after printing the timing, or EXIT_FAILURE when any
 * of the three matrices cannot be allocated.
 */
int main(int argc, char **argv)
{
    const int BOUNDARY = 64;  /* alignment, in bytes, passed to mkl_malloc */
    const int n = 512;
    const int m = n, k = n;
    /* Widen before multiplying so the element count is computed in size_t. */
    const size_t count = (size_t)n * (size_t)n;
    int status = EXIT_FAILURE;

    (void)argc;
    (void)argv;

    double *A = (double*)mkl_malloc(count * sizeof(double), BOUNDARY);
    double *B = (double*)mkl_malloc(count * sizeof(double), BOUNDARY);
    double *C = (double*)mkl_malloc(count * sizeof(double), BOUNDARY);

    if (A != NULL && B != NULL && C != NULL) {
        /*
         * Give every operand a defined value. beta is 1.0, so C is read as
         * well as written; timing dgemm on indeterminate memory can feed it
         * NaNs or denormals and makes the measurement unrepeatable.
         */
        for (size_t i = 0; i < count; i++) {
            A[i] = 1.0;
            B[i] = 1.0;
            C[i] = 0.0;
        }

        const uint64_t t0 = mach_absolute_time();
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, k, 1.0, A, k, B, n, 1.0, C, n);
        const uint64_t t1 = mach_absolute_time();

        printf("elapsed time: %g ns\n", nanodiff(t0, t1));
        status = EXIT_SUCCESS;
    } else {
        printf("could not allocate three %d x %d matrices\n", n, n);
    }

    /* Release only what was actually allocated. */
    if (A != NULL) mkl_free(A);
    if (B != NULL) mkl_free(B);
    if (C != NULL) mkl_free(C);
    return status;
}
-------------- next part --------------
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <cblas.h>
/*
 * Return the time between two tick readings, expressed in nanoseconds.
 *
 * _t0: tick count at the start of the interval.
 * _t1: tick count at the end of the interval.
 *
 * Both values are converted to long double before subtracting, so a _t1
 * smaller than _t0 produces a negative result rather than wrapping. The
 * difference is multiplied by the timebase numerator and divided by its
 * denominator, as reported by mach_timebase_info().
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1)
{
    mach_timebase_info_data_t info;
    long double ticks;
    long double ns;

    mach_timebase_info(&info);

    ticks = (long double)_t1 - (long double)_t0;
    ns = ticks * (long double)info.numer / (long double)info.denom;

    return (double)ns;
}
/*
 * Time a single n x n double-precision matrix product, C = A*B + C, computed
 * by cblas_dgemm on heap-allocated buffers, and print the elapsed time in
 * nanoseconds.
 *
 * Returns EXIT_SUCCESS after printing the timing, or EXIT_FAILURE when any
 * of the three matrices cannot be allocated.
 */
int main(int argc, char **argv)
{
    const int n = 512;
    const int m = n, k = n;
    /* Widen before multiplying so the element count is computed in size_t. */
    const size_t count = (size_t)n * (size_t)n;

    (void)argc;
    (void)argv;

    double *A = malloc(count * sizeof *A);
    double *B = malloc(count * sizeof *B);
    double *C = malloc(count * sizeof *C);

    if (A == NULL || B == NULL || C == NULL) {
        printf("could not allocate three %d x %d matrices\n", n, n);
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    /*
     * Give every operand a defined value. beta is 1.0, so C is read as well
     * as written; timing dgemm on indeterminate memory can feed it NaNs or
     * denormals and makes the measurement unrepeatable.
     */
    for (size_t i = 0; i < count; i++) {
        A[i] = 1.0;
        B[i] = 1.0;
        C[i] = 0.0;
    }

    const uint64_t t0 = mach_absolute_time();
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0, A, k, B, n, 1.0, C, n);
    const uint64_t t1 = mach_absolute_time();

    printf("elapsed time: %g ns\n", nanodiff(t0, t1));

    free(A);
    free(B);
    free(C);
    return EXIT_SUCCESS;
}
More information about the NumPy-Discussion
mailing list