Hello,
I implemented the code segment below to compare "memset", a "for" loop, and "cblas_dscal" (Intel MKL).
The test results are as follows:
memset: G0 = 0.000000 6954 ns
for: G0 = 1.000000 38741 ns
MKL: G0 = 2.000000 12911294 ns
MKL: G0 = 4.000000 13907 ns
MKL: G0 = 8.000000 10264 ns
MKL: G0 = 16.000000 10265 ns
As seen above, the first MKL cblas_dscal call takes 12911294 ns, but after the first iteration the time drops to 13907 ns.
I want to understand what makes the first cblas_dscal call take such a large amount of time, and how I can make that call take less time.
My code is:
#include "stdafx.h"
#include "mkl_dfti.h"
#include "mkl.h"
#include <windows.h>
static LARGE_INTEGER freq;

//freq holds the counter frequency (ticks per second), so ticks * 1000000000 / freq gives nanoseconds;
//a 64-bit return type is needed here, a DWORD would truncate the result
unsigned long long getTimeInNanoSec()
{
    LARGE_INTEGER counterCurrent;
    QueryPerformanceCounter(&counterCurrent);
    return (unsigned long long)counterCurrent.QuadPart * 1000000000ULL / freq.QuadPart;
}
int _tmain(int argc, _TCHAR* argv[])
{
    const int N = 10240;
    unsigned long long start, end;
    double* G = NULL;
    int idx = 0;

    //set timer counter frequency (ticks per second)
    QueryPerformanceFrequency(&freq);

    G = (double*)malloc(N*sizeof(double));
    if (G == NULL)
        return -1;
    //1. memset
    start = getTimeInNanoSec();
    memset(G, 0x0, N*sizeof(double));
    end = getTimeInNanoSec();
    printf("memset: G0:%f %llu ns\n\n", G[100], end - start);
    //2. for
    start = getTimeInNanoSec();
    for (idx = 0; idx < N; idx += 5)
    {
        G[idx]   = 1.0;
        G[idx+1] = 1.0;
        G[idx+2] = 1.0;
        G[idx+3] = 1.0;
        G[idx+4] = 1.0;
    }
    end = getTimeInNanoSec();
    printf("for: G0:%f %llu ns\n\n", G[100], end - start);
    //3. mkl blas
    for (idx = 0; idx < 4; idx++)
    {
        start = getTimeInNanoSec();
        cblas_dscal(N, 2.0, G, 1);
        end = getTimeInNanoSec();
        printf("MKL: G0:%f %llu ns\n\n", G[100], end - start);
    }
    free(G);
    getchar();
    return 0;
}
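
Regarding my second question, one workaround I plan to try, assuming the extra time on the first call comes from one-time MKL initialization (runtime CPU dispatch and worker-thread creation), is to issue an untimed warm-up call before the measured region. This is only a sketch of that idea, not something I have verified:

#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

int main(void)
{
    const int N = 10240;
    double *G = (double*)calloc(N, sizeof(double));
    if (G == NULL)
        return -1;

    /* Assumption: the first cblas_dscal call pays a one-time MKL set-up cost
       (CPU dispatch, thread creation), so an untimed warm-up call here should
       absorb that cost before any measurement is taken. */
    cblas_dscal(N, 1.0, G, 1);

    /* ...timed cblas_dscal calls would follow here, as in my code above... */

    free(G);
    return 0;
}

If the overhead really is threading start-up, calling mkl_set_num_threads(1) before the first cblas_dscal call might also be worth trying, but I have not tested that either.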