For matrix mul A(m,k) * B(k,n):
m=9, k=256, n=256 is faster than m=9, k=512, n=512 and all larger k and n.
On my E5-2630v3 (16 cores, HT disabled), k,n=256 reaches 850 GFLOPS while k,n=512 only reaches 257 GFLOPS.
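Here is my testing code. It runs 64 independent GEMMs per iteration; note that in the code the dimensions are named n (m above), c (k above) and k (n above):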
#include <cstdio> #include <cstdlib> #include <chrono> #include <algorithm> #include <functional> #include <random> #include <omp.h> #include <mkl.h> #include <unistd.h> #define ITERATION 10000 int main(int argc, char *argv[]) { int opt; int n = 9; int c = 256; int c_block = 256; int k = 256; int k_block = 256; int t = 1; while ((opt = getopt(argc, argv, "n:c:k:t:")) != -1) { switch (opt) { case 'n': n = strtol(optarg, NULL, 10); break; case 'c': c = strtol(optarg, NULL, 10); break; case 'k': k = strtol(optarg, NULL, 10); break; case 't': t = strtol(optarg, NULL, 10); break; default: printf("unknown option\n"); } } omp_set_dynamic(0); omp_set_num_threads(t); float *AS[64], *BS[64], *CS[64]; for (int i = 0; i < 64; ++i) { AS[i] = (float*)mkl_malloc(sizeof(float)*n*c, 64); BS[i] = (float*)mkl_malloc(sizeof(float)*c*k, 64); CS[i] = (float*)mkl_malloc(sizeof(float)*n*k, 64); } auto randgen = std::bind(std::uniform_real_distribution<float>(), std::mt19937(0)); for (int i = 0; i < 64; ++i) { std::generate(AS[i], AS[i]+n*c, std::ref(randgen)); std::generate(BS[i], BS[i]+c*k, std::ref(randgen)); // std::generate(CS[i], CS[i]+n*k, std::ref(randgen)); } using Clock = std::chrono::high_resolution_clock; auto t1 = Clock::now(); for (int iter = 0; iter < ITERATION; ++iter) { #pragma omp parallel { const int nthreads = omp_get_num_threads(); const int mythread = omp_get_thread_num(); const int start = mythread*64/nthreads; const int finish = (mythread+1)*64/nthreads; mkl_set_num_threads_local(1); for (int i = start; i < finish; ++i) { float * A = AS[i]; float * B = BS[i]; float * C = CS[i]; cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, k, c, 1, A, c, B, k, 0, C, k); } } } auto t2 = Clock::now(); auto elapsed = t2 - t1; auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count(); // printf("%.1lfs\n", 1e-9 * time); printf("%.lfGFLOPS\n", 1.0 * ITERATION * 64 * 2 * n * c * k / time); for (int i = 0; i < 64; ++i) { mkl_free(AS[i]); mkl_free(BS[i]); 
mkl_free(CS[i]); } return 0; }