Hi all,
I just noticed a potential performance bug in the DGEMM implementation of MKL (16.0.1)
when using a single thread. I merely want to make someone at Intel aware of it, in case it is of interest.
Strangely, DGEMM performs better for beta=1 than for beta=0 in certain
situations. Here is an example:
Intel(R) Xeon(R) CPU E5-2650:
m=72, n=373248, k=72, beta=0.00 : 14.25 GF
m=72, n=373248, k=72, beta=1.00 : 18.36 GF
Intel(R) Xeon(R) CPU E5-2650:
m=72, n=373248, k=72, beta=0.00 : 19.25 GF
m=72, n=373248, k=72, beta=1.00 : 28.34 GF
As you can see, the performance difference is significant. It is
so significant, in fact, that it pays off to set C to zero explicitly
before calling MKL and then use the more efficient beta=1
implementation instead.
Here is a quick test driver:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

/*
 * Fortran BLAS DGEMM: C := alpha * op(A) * op(B) + beta * C.
 * All arguments are passed by reference (Fortran calling convention).
 */
extern "C" int dgemm_(char *transa, char *transb, int *m, int *n, int *k,
                      double *alpha, double *a, int *lda, double *b, int *ldb,
                      double *beta, double *c, int *ldc);

/*
 * Streams over two large float buffers so that the data DGEMM touched in the
 * previous repetition is likely evicted from the caches before the next one.
 *
 * trash1  buffer that is read and updated in place (nTotal elements)
 * trash2  buffer that is only read (nTotal elements)
 * nTotal  number of elements in each buffer
 */
void trashCache(float* trash1, float* trash2, int nTotal)
{
    for (int i = 0; i < nTotal; i++) {
        trash1[i] += 0.99 * trash2[i];
    }
}

/*
 * Allocates `count` floats and writes `value` to every element, so the
 * buffer holds defined data before trashCache() reads it.
 * Returns NULL if the allocation fails.
 */
static float *allocTrash(size_t count, float value)
{
    float *buf = (float *) malloc(sizeof(float) * count);
    if (buf == NULL) {
        return NULL;
    }
    for (size_t i = 0; i < count; i++) {
        buf[i] = value;
    }
    return buf;
}

/*
 * Allocates a 64-byte aligned array of `count` doubles and fills it with
 * `value`. DGEMM must never be fed indeterminate data: stray NaNs or
 * denormals in uninitialised memory can distort the measured timings.
 * Returns NULL if the allocation fails.
 */
static double *allocMatrix(size_t count, double value)
{
    void *raw = NULL;
    if (posix_memalign(&raw, 64, sizeof(double) * count) != 0) {
        return NULL;
    }
    double *mat = (double *) raw;
    for (size_t i = 0; i < count; i++) {
        mat[i] = value;
    }
    return mat;
}

/*
 * Benchmark driver: times C := alpha * A^T * B + beta * C for a fixed
 * problem size and prints the best GFLOP/s rate over three repetitions.
 *
 * Usage: <program> <beta>
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        printf("Usage: <beta>\n");
        exit(-1);
    }

    /* Reject non-numeric input instead of silently benchmarking beta = 0. */
    char *end = NULL;
    double beta = strtod(argv[1], &end);
    if (end == argv[1] || *end != '\0') {
        fprintf(stderr, "Invalid beta: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    int nTotal = 1024 * 1024 * 100;

    int m = 72;
    int n = 72 * 72 * 72;
    int k = 72;
    double flops = 2.E-9 * m * n * k;
    double alpha = 1;

    /*
     * Leading dimensions of the column-major operands. With transa = 'T',
     * A is stored as a k-by-m array, so its leading dimension is k (not m).
     */
    int lda = k;
    int ldb = k;
    int ldc = m;

    /* Writable flags: dgemm_ is declared with non-const char pointers. */
    char transa = 'T';
    char transb = 'N';

    float *trash1 = allocTrash((size_t) nTotal, 0.0f);
    float *trash2 = allocTrash((size_t) nTotal, 1.0f);
    double *A = allocMatrix((size_t) m * k, 1.0);
    double *B = allocMatrix((size_t) n * k, 1.0);
    double *C = allocMatrix((size_t) m * n, 0.0);

    int status = EXIT_SUCCESS;
    if (trash1 == NULL || trash2 == NULL || A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Memory allocation failed\n");
        status = EXIT_FAILURE;
    } else {
        double minTime = 1e100;
        for (int i = 0; i < 3; i++) {
            /* Start every repetition with cold caches. */
            trashCache(trash1, trash2, nTotal);

            double t = omp_get_wtime();
            dgemm_(&transa, &transb, &m, &n, &k, &alpha, A, &lda, B, &ldb,
                   &beta, C, &ldc);
            t = omp_get_wtime() - t;

            minTime = (minTime < t) ? minTime : t;
        }
        printf("m=%d, n=%d, k=%d, beta=%.2f : %.2lf GF\n", m, n, k, beta, flops / minTime);
    }

    /* free(NULL) is a no-op, so this is safe on the failure path too. */
    free(A);
    free(B);
    free(C);
    free(trash1);
    free(trash2);
    return status;
}
Best, Paul