MKL GEMM slower for larger matrices

For the matrix multiplication A(m,k) * B(k,n) (in the code below, m, k and n correspond to the variables n, c and k respectively):

The case m=9, k=256, n=256 achieves higher throughput than m=9, k=512, n=512, and than every larger k and n I tried.

On my E5-2630v3 (16 cores, HT disabled), k=n=256 reaches 850 GFLOPS, while k=n=512 reaches only 257 GFLOPS.

Here is my test code. Each iteration runs 64 independent GEMMs:

#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <algorithm>
#include <functional>
#include <random>
#include <omp.h>
#include <mkl.h>
#include <unistd.h>


#define ITERATION 10000

// Benchmark driver: times ITERATION rounds of 64 independent single-precision
// GEMMs (C = A * B with A of size n x c, B of size c x k, C of size n x k,
// row-major) and prints the achieved throughput in GFLOPS.
//
// Command-line options (all optional; each must be a positive integer):
//   -n <rows>    rows of A and C               (default 9)
//   -c <inner>   columns of A / rows of B      (default 256)
//   -k <cols>    columns of B and C            (default 256)
//   -t <count>   number of OpenMP threads      (default 1)
//
// Returns 0 on success and EXIT_FAILURE on an unknown option, a non-positive
// or out-of-range argument, or an allocation failure.
int main(int argc, char *argv[])
{
	// Number of independent (A, B, C) triples multiplied in every iteration.
	constexpr int kNumGemms = 64;

	// Parses a base-10 option value; yields 0 (rejected below) when the text
	// is not a positive number that fits in an int.
	const auto parse_positive = [](const char *text) -> int {
		const long value = strtol(text, nullptr, 10);
		const int narrowed = static_cast<int>(value);
		return (value > 0 && narrowed == value) ? narrowed : 0;
	};

	int n = 9;
	int c = 256;
	int k = 256;
	int t = 1;

	int opt;
	while ((opt = getopt(argc, argv, "n:c:k:t:")) != -1) {
		switch (opt) {
			case 'n': n = parse_positive(optarg); break;
			case 'c': c = parse_positive(optarg); break;
			case 'k': k = parse_positive(optarg); break;
			case 't': t = parse_positive(optarg); break;
			default:
				// Do not run a benchmark whose parameters the user did not intend.
				printf("unknown option\n");
				return EXIT_FAILURE;
		}
	}

	if (n <= 0 || c <= 0 || k <= 0 || t <= 0) {
		fprintf(stderr, "n, c, k and t must be positive integers\n");
		return EXIT_FAILURE;
	}

	omp_set_dynamic(0);
	omp_set_num_threads(t);

	// Element counts are computed in size_t so that large dimensions do not
	// overflow int arithmetic.
	const std::size_t a_elems = static_cast<std::size_t>(n) * c;
	const std::size_t b_elems = static_cast<std::size_t>(c) * k;
	const std::size_t c_elems = static_cast<std::size_t>(n) * k;

	// Zero-initialised so that cleanup can tell which buffers were allocated.
	float *AS[kNumGemms] = {};
	float *BS[kNumGemms] = {};
	float *CS[kNumGemms] = {};

	// Frees every buffer that was successfully allocated.
	const auto release_all = [&]() {
		for (int i = 0; i < kNumGemms; ++i) {
			if (AS[i]) mkl_free(AS[i]);
			if (BS[i]) mkl_free(BS[i]);
			if (CS[i]) mkl_free(CS[i]);
		}
	};

	for (int i = 0; i < kNumGemms; ++i) {
		// 64-byte alignment is requested for every buffer.
		AS[i] = static_cast<float*>(mkl_malloc(sizeof(float) * a_elems, 64));
		BS[i] = static_cast<float*>(mkl_malloc(sizeof(float) * b_elems, 64));
		CS[i] = static_cast<float*>(mkl_malloc(sizeof(float) * c_elems, 64));
		if (!AS[i] || !BS[i] || !CS[i]) {
			fprintf(stderr, "allocation failed for matrix set %d\n", i);
			release_all();
			return EXIT_FAILURE;
		}
	}

	// Fixed seed (0) so every run multiplies the same input data.
	auto randgen = std::bind(std::uniform_real_distribution<float>(), std::mt19937(0));
	for (int i = 0; i < kNumGemms; ++i) {
		std::generate(AS[i], AS[i] + a_elems, std::ref(randgen));
		std::generate(BS[i], BS[i] + b_elems, std::ref(randgen));
		// C is deliberately left unfilled: the GEMM below is called with
		// beta = 0 (see the cblas_sgemm documentation for how C is treated).
	}

	// steady_clock is monotonic, which is what interval timing needs.
	using Clock = std::chrono::steady_clock;
	const auto t1 = Clock::now();
	for (int iter = 0; iter < ITERATION; ++iter) {
		#pragma omp parallel
		{
			// Static partition: thread j handles the GEMMs with index in
			// [j*kNumGemms/nthreads, (j+1)*kNumGemms/nthreads).
			const int nthreads = omp_get_num_threads();
			const int mythread = omp_get_thread_num();
			const int start = mythread * kNumGemms / nthreads;
			const int finish = (mythread + 1) * kNumGemms / nthreads;
			// Request one MKL thread for calls issued from this OpenMP thread;
			// the parallelism here comes from the OpenMP region.
			mkl_set_num_threads_local(1);
			for (int i = start; i < finish; ++i)
			{
				// M = n, N = k, K = c; alpha = 1, beta = 0; lda = c, ldb = k, ldc = k.
				cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
				            n, k, c, 1, AS[i], c, BS[i], k, 0, CS[i], k);
			}
		}
	}
	const auto t2 = Clock::now();
	const auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
	// printf("%.1lfs\n", 1e-9 * time);
	// 2*n*c*k floating-point operations per GEMM; operations per nanosecond
	// is numerically equal to GFLOPS. The leading 1.0 keeps the product in double.
	printf("%.lfGFLOPS\n", 1.0 * ITERATION * kNumGemms * 2 * n * c * k / time);

	release_all();
	return 0;
}

MKL GEMM slower for larger matrices

Trending Articles

Police confirm man stabbed to death in Selsdon was Andrew David Else of Croydon

मुख मैथुन से उठाएं सेक्स का भरपूर मज़ा, जानें क्या है इसका सही तरीकामुख मैथुन...

Practice Sheet of Right form of verbs for HSC Students

Thread: Ticket to Ride Legacy: Legends of the West:: General:: [SPOILERS]...

Kalank - Malayalam (1CD ) - subtitles

Stephanie cheung vs victoria hay vs estrina ang

District Child Protection Unit (DCPU)Phone Numbers/Mobile Numbers in Kerala...

NCERT Solutions for Class 9th Sanskrit Chapter 3 पाथेयम्

libdevinfo を使ってネットワークインターフェイスデバイスの一覧を取得する

Mp3 Download: Mdu - Nammer

Brunei reaffirms healthcare commitment

The Ten Commandments of Digital Control (Part 4)

Muloraki Au

99 God Status for Whatsapp, Facebook

Skint TV teen to be sentenced

Born To Be Wild: Chicago Outfit Hit Squad Littered The Streets With Bodies...

Raj Panchayat 3rd / Third Grade Teacher Revised Result 2012 Level 1-2...

Ilahi mera jee aaye/ Shaame Malang si Lyrics Translation

DD Kashir channel packaging bids invited by 29 june

Procedure for conduct of supplementary DPC