Quantcast
Channel: Intel® oneAPI Math Kernel Library & Intel® Math Kernel Library
Viewing all articles
Browse latest Browse all 2652

MKL GEMM slower for larger matrices

$
0
0

For matrix mul A(m,k) * B(k,n):

m=9, k=256, n=256 is faster than m=9, k=512, n=512 and all larger k and n.

On my E5-2630v3 (16 cores, HT disabled), k = n = 256 reaches 850 GFLOPS, while k = n = 512 only reaches 257 GFLOPS.

Here is my testing code. I am doing 64 gemms here: 

#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <algorithm>
#include <functional>
#include <random>
#include <omp.h>
#include <mkl.h>
#include <unistd.h>


// Number of timed repetitions of the outer benchmark loop in main(); every
// repetition runs the full batch of GEMM calls once.
#define ITERATION 10000

/**
 * Batched SGEMM micro-benchmark.
 *
 * Allocates 64 independent problems C[i] (n x k) = A[i] (n x c) * B[i] (c x k),
 * fills A and B with uniform random floats, then runs ITERATION rounds in which
 * the 64 products are split statically across the OpenMP threads, each thread
 * calling cblas_sgemm with MKL's own threading limited to one thread.
 * Prints the achieved rate as floating-point operations per nanosecond (GFLOPS).
 *
 * Options (all must be positive integers):
 *   -n rows of A / C        (default 9)
 *   -c inner dimension      (default 256)
 *   -k columns of B / C     (default 256)
 *   -t OpenMP thread count  (default 1)
 *
 * Returns 0 on success, EXIT_FAILURE on a bad option or allocation failure.
 *
 * NOTE(review): the data touched per round is 64 * (n*c + c*k + n*k) floats.
 * Once that exceeds the last-level cache the measurement is presumably bound by
 * memory bandwidth rather than by GEMM throughput - confirm with a profiler
 * before comparing rates across different c/k.
 */
int main(int argc, char *argv[])
{
	// Number of independent GEMM problems executed in every round.
	const int kBatch = 64;

	int n = 9;
	int c = 256;
	int k = 256;
	int t = 1;

	// Parses a strictly positive decimal int; rejects trailing garbage and
	// values that do not survive the long -> int round trip.
	auto parse_positive = [](const char *text, int &out) -> bool {
		char *end = NULL;
		const long value = strtol(text, &end, 10);
		if (end == text || *end != '\0' || value <= 0) {
			return false;
		}
		const int narrowed = static_cast<int>(value);
		if (static_cast<long>(narrowed) != value) {
			return false;
		}
		out = narrowed;
		return true;
	};

	int opt;
	while ((opt = getopt(argc, argv, "n:c:k:t:")) != -1) {
		int *target = NULL;
		switch (opt) {
			case 'n': target = &n; break;
			case 'c': target = &c; break;
			case 'k': target = &k; break;
			case 't': target = &t; break;
			default:
				// Do not silently benchmark with defaults after a typo.
				printf("unknown option\n");
				fprintf(stderr, "usage: %s [-n rows] [-c inner] [-k cols] [-t threads]\n", argv[0]);
				return EXIT_FAILURE;
		}
		if (!parse_positive(optarg, *target)) {
			fprintf(stderr, "option -%c expects a positive integer, got '%s'\n", opt, optarg);
			return EXIT_FAILURE;
		}
	}

	// Fix the team size so the static partition below is stable across rounds.
	omp_set_dynamic(0);
	omp_set_num_threads(t);

	// Element counts computed in size_t so large dimensions cannot overflow int.
	const size_t a_len = static_cast<size_t>(n) * c;
	const size_t b_len = static_cast<size_t>(c) * k;
	const size_t c_len = static_cast<size_t>(n) * k;

	float *AS[kBatch] = {};
	float *BS[kBatch] = {};
	float *CS[kBatch] = {};

	// Frees whatever has been allocated so far; safe on partially filled arrays.
	auto release_all = [&]() {
		for (int i = 0; i < kBatch; ++i) {
			if (AS[i]) mkl_free(AS[i]);
			if (BS[i]) mkl_free(BS[i]);
			if (CS[i]) mkl_free(CS[i]);
		}
	};

	for (int i = 0; i < kBatch; ++i) {
		AS[i] = (float*)mkl_malloc(sizeof(float) * a_len, 64);
		BS[i] = (float*)mkl_malloc(sizeof(float) * b_len, 64);
		CS[i] = (float*)mkl_malloc(sizeof(float) * c_len, 64);
		if (!AS[i] || !BS[i] || !CS[i]) {
			fprintf(stderr, "mkl_malloc failed for problem %d\n", i);
			release_all();
			return EXIT_FAILURE;
		}
	}

	// Fixed seed keeps the inputs identical from run to run. C is left
	// unfilled because the GEMM below is called with beta = 0.
	auto randgen = std::bind(std::uniform_real_distribution<float>(), std::mt19937(0));
	for (int i = 0; i < kBatch; ++i) {
		std::generate(AS[i], AS[i] + a_len, std::ref(randgen));
		std::generate(BS[i], BS[i] + b_len, std::ref(randgen));
	}

	using Clock = std::chrono::high_resolution_clock;
	auto t1 = Clock::now();
	for (int iter = 0; iter < ITERATION; ++iter) {
		#pragma omp parallel
		{
			// Static block partition of the batch over the threads in the team.
			const int nthreads = omp_get_num_threads();
			const int mythread = omp_get_thread_num();
			const int start = mythread * kBatch / nthreads;
			const int finish = (mythread + 1) * kBatch / nthreads;
			// Parallelism comes from the batch, so each MKL call stays serial.
			mkl_set_num_threads_local(1);
			for (int i = start; i < finish; ++i)
			{
				float * A = AS[i];
				float * B = BS[i];
				float * C = CS[i];
				// Row-major: A is n x c (lda = c), B is c x k (ldb = k), C is n x k (ldc = k).
				cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, k, c, 1, A, c, B, k, 0, C, k);
			}
		}
	}
	auto t2 = Clock::now();
	auto elapsed = t2 - t1;
	auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
	// 2*n*c*k flops per GEMM; flops per nanosecond equals GFLOPS.
	// The leading 1.0 keeps the whole product in double.
	printf("%.lfGFLOPS\n", 1.0 * ITERATION * kBatch * 2 * n * c * k / time);

	release_all();
	return 0;
}

 


Viewing all articles
Browse latest Browse all 2652

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>