My code:
// -*- C++ -*- # include <cmath> # include <ctime> # include <cstring> # include <cstdio> # include "mkl.h" int main (int argc, char * argv[]) { MKL_LONG D[2] = {SIZE, SIZE}; MKL_LONG C = COUNT; MKL_LONG ST[3] = {0, (D[1] * sizeof(double) + 63) / 64 * (64 / sizeof(double)), 1}; MKL_LONG DI = D[0] * ST[1]; MKL_LONG SI = D[0] * ST[1]; double SC = 1.0 / std::sqrt((double)SI); struct timespec BE, EN; double*const Efft_r = (double*)_mm_malloc(sizeof(double) * SI * C * 2, 64); memset(Efft_r, 0, sizeof(double) * SI * C * 2); double*const Efft_i = Efft_r + SI * C; Efft_r[0] = 1.0; clock_gettime (CLOCK_REALTIME, &BE); for (int i=0; i<LOOP; ++i) { MKL_LONG status; DFTI_DESCRIPTOR_HANDLE hand; DftiCreateDescriptor(&hand, DFTI_DOUBLE, DFTI_COMPLEX, 2, D); DftiSetValue(hand, DFTI_INPUT_STRIDES, ST); DftiSetValue(hand, DFTI_OUTPUT_STRIDES, ST); DftiSetValue(hand, DFTI_NUMBER_OF_TRANSFORMS, C); DftiSetValue(hand, DFTI_INPUT_DISTANCE, DI); DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_REAL_REAL); DftiSetValue(hand, DFTI_FORWARD_SCALE, SC); DftiSetValue(hand, DFTI_BACKWARD_SCALE, SC); DftiSetValue(hand, DFTI_THREAD_LIMIT, 1); DftiSetValue(hand, DFTI_NUMBER_OF_USER_THREADS, 1); DftiCommitDescriptor(hand); __assume_aligned(Efft_r, 64); __assume_aligned(Efft_i, 64); DftiComputeForward(hand, Efft_r, Efft_i); DftiFreeDescriptor(&hand); } clock_gettime (CLOCK_REALTIME, &EN); printf("DFTI_COMPLEX_STORAGE: DFTI_REAL_REAL\nLOOP: \t%d\nSIZE: \t%d X %d\nSTRIDES:\t%d %d %d\nNUMBER: \t%d\nDISTANCE:\t%d\n\t\t\t\t%.9fs\n", LOOP, D[0], D[1], ST[0], ST[1], ST[2], C, DI, double(EN.tv_sec-BE.tv_sec)+double(EN.tv_nsec-BE.tv_nsec)/1e9); _mm_free(Efft_r); return 0; }
This code was compiled with icpc using the flags "-mkl -DSIZE=4096 -DLOOP=1 -DCOUNT=3".
When I run this program without MPI, the output is below:
$ ./a.out DFTI_COMPLEX_STORAGE: DFTI_REAL_REAL LOOP: 1 SIZE: 4096 X 4096 STRIDES: 0 4096 1 NUMBER: 3 DISTANCE: 16777216 0.322017125s
When I run the same program with MPI, the output is below:
$ mpirun -n 1 ./a.out DFTI_COMPLEX_STORAGE: DFTI_REAL_REAL LOOP: 1 SIZE: 4096 X 4096 STRIDES: 0 4096 1 NUMBER: 3 DISTANCE: 16777216 1.606980538s
The program runs much faster without MPI than with MPI. I have tried different values of SIZE, but the results are similar.
I do not know why. If I must use MPI, is there any way to keep the speed of MKL?