Dear all,
I am new to the forum and, of course, to MKL (though I've used TBB before). I am using the MKL Link Helper to compile and link the first C example, dgemm_threading_effect_example.c, but I cannot figure out how to use TBB.
I know it is possible to use just TBB without OpenMP (which I don't have, being on a Mac), but it seems that I need to link the mkl_sequential library, and it seems no threads can be used.
Below you can find the example with my few added lines of code, and here are my linker switches:
-L/usr/local/lib -ltbb -ltbbmalloc -L/opt/intel/compilers_and_libraries_2016/mac/mkl/lib -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential
Thanks for any help you can give me!
Franco
#include <stdio.h> #include <stdlib.h> #include "mkl.h" #include <tbb/task_scheduler_init.h> /* Consider adjusting LOOP_COUNT based on the performance of your computer */ /* to make sure that total run time is at least 1 second */ #define LOOP_COUNT 10 int main() { double *A, *B, *C; int m, n, p, i, j, r, max_threads; double alpha, beta; double s_initial, s_elapsed; printf ("\n This example demonstrates threading impact on computing real matrix product \n"" C=alpha*A*B+beta*C using Intel(R) MKL function dgemm, where A, B, and C are \n"" matrices and alpha and beta are double precision scalars \n\n"); m = 2000, p = 200, n = 1000; printf (" Initializing data for matrix multiplication C=A*B for matrix \n"" A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n); alpha = 1.0; beta = 0.0; printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"" performance \n\n"); A = (double *)mkl_malloc( m*p*sizeof( double ), 64 ); B = (double *)mkl_malloc( p*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... 
\n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf (" Intializing matrix data \n\n"); for (i = 0; i < (m*p); i++) { A[i] = (double)(i+1); } for (i = 0; i < (p*n); i++) { B[i] = (double)(-i-1); } for (i = 0; i < (m*n); i++) { C[i] = 0.0; } // HERE I TRY BUT IT'S ALWAYS ONE SINGLE THREAD tbb::task_scheduler_init scheduler(4); mkl_set_num_threads(4); mkl_set_num_threads_local(4); printf (" Finding max number of threads Intel(R) MKL can use for parallel runs \n\n"); // HERE I ALWAYS GET ONE max_threads = mkl_get_max_threads(); printf (" Running Intel(R) MKL from 1 to %i threads \n\n", max_threads); for (i = 1; i <= max_threads; i++) { for (j = 0; j < (m*n); j++) C[j] = 0.0; printf (" Requesting Intel(R) MKL to use %i thread(s) \n\n", i); mkl_set_num_threads(i); printf (" Making the first run of matrix product using Intel(R) MKL dgemm function \n"" via CBLAS interface to get stable run time measurements \n\n"); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, p, alpha, A, p, B, n, beta, C, n); printf (" Measuring performance of matrix product using Intel(R) MKL dgemm function \n"" via CBLAS interface on %i thread(s) \n\n", i); s_initial = dsecnd(); for (r = 0; r < LOOP_COUNT; r++) { cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, p, alpha, A, p, B, n, beta, C, n); } s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT; printf (" == Matrix multiplication using Intel(R) MKL dgemm completed ==\n"" == at %.5f milliseconds using %d thread(s) ==\n\n", (s_elapsed * 1000), i); } printf (" Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); if (s_elapsed < 0.9/LOOP_COUNT) { s_elapsed=1.0/LOOP_COUNT/s_elapsed; i=(int)(s_elapsed*LOOP_COUNT)+1; printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"" computer as %i to have total execution time about 1 second for reliability \n"" of measurements\n\n", i); } printf (" Example completed. \n\n"); return 0; }