Why does MPI impact the speed of MKL's DFT?

My code:

// -*- C++ -*-

# include <cmath>
# include <ctime>
# include <cstring>
# include <cstdio>

# include "mkl.h"

int main (int argc, char * argv[])
{
  MKL_LONG D[2] = {SIZE, SIZE};
  MKL_LONG C = COUNT;
  MKL_LONG ST[3] = {0, (D[1] * sizeof(double) + 63) / 64 * (64 / sizeof(double)), 1};
  MKL_LONG DI = D[0] * ST[1];
  MKL_LONG SI = D[0] * ST[1];
  double SC = 1.0 / std::sqrt((double)SI);
  struct timespec BE, EN;

  double*const Efft_r = (double*)_mm_malloc(sizeof(double) * SI * C * 2, 64);
  memset(Efft_r, 0, sizeof(double) * SI  * C * 2);
  double*const Efft_i = Efft_r + SI * C;

  Efft_r[0] = 1.0;

  clock_gettime (CLOCK_REALTIME, &BE);
  for (int i=0; i<LOOP; ++i)
    {
      MKL_LONG status;
      DFTI_DESCRIPTOR_HANDLE hand;
      DftiCreateDescriptor(&hand, DFTI_DOUBLE, DFTI_COMPLEX, 2, D);
      DftiSetValue(hand, DFTI_INPUT_STRIDES, ST);
      DftiSetValue(hand, DFTI_OUTPUT_STRIDES, ST);
      DftiSetValue(hand, DFTI_NUMBER_OF_TRANSFORMS, C);
      DftiSetValue(hand, DFTI_INPUT_DISTANCE, DI);
      DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_REAL_REAL);
      DftiSetValue(hand, DFTI_FORWARD_SCALE, SC);
      DftiSetValue(hand, DFTI_BACKWARD_SCALE, SC);
      DftiSetValue(hand, DFTI_THREAD_LIMIT, 1);
      DftiSetValue(hand, DFTI_NUMBER_OF_USER_THREADS, 1);
      DftiCommitDescriptor(hand);
      __assume_aligned(Efft_r, 64);
      __assume_aligned(Efft_i, 64);
      DftiComputeForward(hand, Efft_r, Efft_i);
      DftiFreeDescriptor(&hand);
    }
  clock_gettime (CLOCK_REALTIME, &EN);
  printf("DFTI_COMPLEX_STORAGE: DFTI_REAL_REAL\nLOOP:   \t%d\nSIZE:   \t%d X %d\nSTRIDES:\t%d %d %d\nNUMBER: \t%d\nDISTANCE:\t%d\n\t\t\t\t%.9fs\n",
	 LOOP,
	 D[0], D[1],
	 ST[0], ST[1], ST[2],
	 C,
	 DI,
	 double(EN.tv_sec-BE.tv_sec)+double(EN.tv_nsec-BE.tv_nsec)/1e9);
  _mm_free(Efft_r);

  return 0;
}

This code was compiled with icpc using the flags "-mkl -DSIZE=4096 -DLOOP=1 -DCOUNT=3".

When I run this program without MPI, the output is below:

$ ./a.out
DFTI_COMPLEX_STORAGE: DFTI_REAL_REAL
LOOP:   	1
SIZE:   	4096 X 4096
STRIDES:	0 4096 1
NUMBER: 	3
DISTANCE:	16777216
				0.322017125s

When I run the same program with MPI, the output is below:

$ mpirun -n 1 ./a.out
DFTI_COMPLEX_STORAGE: DFTI_REAL_REAL
LOOP:   	1
SIZE:   	4096 X 4096
STRIDES:	0 4096 1
NUMBER: 	3
DISTANCE:	16777216
				1.606980538s

The program runs much faster without MPI than with MPI. I have tried different values of SIZE, but the results are similar.

I do not know why this happens. If I must use MPI, is there any way to keep the speed of MKL?