Dear Intel
With your help a month ago, I was able to set up the 'cluster_sparse_solver_64' program with iparm[1] = 10 (the MPI version of the nested dissection and symbolic factorization algorithms) on my cluster of 4 machines.
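For reference, the control-parameter setup in the attached code boils down to roughly the excerpt below (first_row/last_row are just placeholders here for the per-rank domain bounds that each MPI process sets; the full listing is at the end of this post):

    MKL_INT64 iparm[64] = { 0 };
    iparm[ 0] = 1;         /* override solver defaults with the values in iparm */
    iparm[ 1] = 10;        /* MPI version of nested dissection / symbolic factorization */
    iparm[ 7] = 2;         /* max number of iterative refinement steps */
    iparm[ 9] = 13;        /* perturb pivots with 1E-13 */
    iparm[12] = 1;         /* maximum weighted matching */
    iparm[40] = first_row; /* first row of this rank's input domain (1-based) */
    iparm[41] = last_row;  /* last row of this rank's input domain */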
The program is intended for very large sparse matrices with about 10^8 - 10^9 rows.
However, the cluster run (4 MPI x 4 OpenMP) shows lower performance than a single machine (1 MPI x 4 OpenMP).
The following table shows the test results.
Target matrix: 4*10^8 rows

Time consumption     | 1 MPI, 4 OpenMP | 4 MPI, 4 OpenMP (iparm[1]=3) | 4 MPI, 4 OpenMP (iparm[1]=10)
---------------------+-----------------+------------------------------+------------------------------
Reordering time      |       1021.6 s  |                    2094.3 s  |                     7644.3 s
Factorization time   |       1403.2 s  |                    2136.6 s  |                     9263.1 s
Solution time        |        158.6 s  |                     684.9 s  |                     554.14 s
---------------------------------------------------------------------------------------------------
Could you please look into this issue?
I attach my code below.
Thank you very much in advance!!!
Regards,
Yong-hee
P.S. This is my code (almost the same as the MKL example code).
=================================================================================
/*******************************************************************************
* Copyright 2004-2015 Intel Corporation All Rights Reserved.
*
* The source code, information and material ("Material") contained herein is
* owned by Intel Corporation or its suppliers or licensors, and title to such
* Material remains with Intel Corporation or its suppliers or licensors. The
* Material contains proprietary information of Intel or its suppliers and
* licensors. The Material is protected by worldwide copyright laws and treaty
* provisions. No part of the Material may be used, copied, reproduced,
* modified, published, uploaded, posted, transmitted, distributed or disclosed
* in any way without Intel's prior express written permission. No license under
* any patent, copyright or other intellectual property rights in the Material
* is granted to or conferred upon you, either expressly, by implication,
* inducement, estoppel or otherwise. Any license under such intellectual
* property rights must be express and approved by Intel in writing.
*
* Unless otherwise agreed by Intel in writing, you may not remove or alter this
* notice or any other notice embedded in Materials by Intel or Intel's
* suppliers or licensors in any way.
*******************************************************************************/

/*
*  MKL Cluster Sparse Solver example demonstrating the case when the initial
*  data (matrix and rhs) is distributed between several MPI processes, and the
*  final solution is distributed between the MPI processes in the same way as
*  they hold the initial data.
*******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <time.h>
#include "mpi.h"
#include "mkl.h"
#include "mkl_cluster_sparse_solver.h"

#ifdef MKL_ILP64
#define MPI_DT MPI_LONG
#else
#define MPI_DT MPI_INT
#endif

#define MPI_REDUCE_AND_BCAST \
    MPI_Reduce(&err_mem, &error, 1, MPI_DT, MPI_SUM, 0, MPI_COMM_WORLD); \
    MPI_Bcast(&error, 1, MPI_DT, 0, MPI_COMM_WORLD);

int main(void)
{
    clock_t before;
    double result;
    MKL_INT64 AllocatedBytes;
    int N_AllocatedBuffers;

    mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
    AllocatedBytes = mkl_mem_stat(&N_AllocatedBuffers);
    mkl_set_num_threads(2);

    FILE* pre_inputFile = fopen("/user/source/CREATE_MATRIX/SPM_test_20000_whole.txt", "r");
    if (pre_inputFile == NULL) { puts("There is no file."); }
    MKL_INT64 n;
    fscanf(pre_inputFile, "%lld", &n);
    n -= 1;
    fclose(pre_inputFile);

    /* Matrix data. */
    //MKL_INT64 n = 5;
    MKL_INT64 mtype = 2;        /* Real symmetric positive definite matrix */
    MKL_INT64 *ia = NULL;
    MKL_INT64 *ja = NULL;
    double *a = NULL;
    /* RHS and solution vectors. */
    double *b = NULL;
    double *x = NULL;
    char hostName[1024] = "\0";
    MKL_INT64 nrhs = 1;         /* Number of right hand sides. */
    /* Internal solver memory pointer pt, */
    /* 32-bit: int pt[64]; 64-bit: long int pt[64] */
    /* or void *pt[64] should be OK on both architectures */
    void *pt[64] = { 0 };
    /* Cluster Sparse Solver control parameters. */
    MKL_INT64 iparm[64] = { 0 };
    MKL_INT64 maxfct, mnum, phase, msglvl, error, err_mem;
    /* Auxiliary variables. */
    double ddum;                /* Double dummy */
    MKL_INT64 idum;             /* Integer dummy. */
    MKL_INT64 j;
    int mpi_stat = 0;
    int argc = 0;
    int comm, rank, size;
    char** argv;

    printf("at the beginning of the program, peak memory : %ld bytes\n", mkl_peak_mem_usage(MKL_PEAK_MEM));

    /* -------------------------------------------------------------------- */
    /* .. Init MPI.                                                         */
    /* -------------------------------------------------------------------- */
    mpi_stat = MPI_Init( &argc, &argv );
    mpi_stat = MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    mpi_stat = MPI_Comm_size( MPI_COMM_WORLD, &size );
    comm = MPI_Comm_c2f( MPI_COMM_WORLD );
    //printf ("comm : %d, rank : %d, size : %d", comm, rank, size);
    if ( size < 2 )
    {
        printf("\nERROR: this example doesn't work on number of MPI less than 2");
        mpi_stat = MPI_Finalize();
        return 1;
    }

    /* -------------------------------------------------------------------- */
    /* .. Setup Cluster Sparse Solver control parameters.                   */
    /* -------------------------------------------------------------------- */
    iparm[ 0] = 1;  /* Solver defaults are overridden by the values provided in iparm */
    iparm[ 1] = 10; /* MPI version of the nested dissection and symbolic factorization algorithms */
    iparm[ 5] = 0;  /* Write solution into x */
    iparm[ 7] = 2;  /* Max number of iterative refinement steps */
    iparm[ 9] = 13; /* Perturb the pivot elements with 1E-13 */
    iparm[10] = 0;  /* Don't use nonsymmetric permutation and scaling MPS */
    iparm[12] = 1;  /* Switch on Maximum Weighted Matching algorithm (default for non-symmetric) */
    //iparm[17] = -1; /* Output: Number of nonzeros in the factor LU */
    //iparm[18] = -1; /* Output: Mflops for LU factorization */
    //iparm[26] = 1;  /* Check input data for correctness */
    //iparm[34] = 1;  /* Cluster Sparse Solver uses C-style indexing for ia and ja arrays */
    //iparm[39] = 2;  /* Input: matrix/rhs/solution are distributed between MPI processes */
    /* If iparm[39]=2, the matrix is provided in distributed assembled matrix input
       format. In this case, each MPI process stores only a part (or domain) of the
       matrix A data. The bounds of the domain should be set via iparm(41) and
       iparm(42). The solution vector is distributed between processes in the same
       manner as the rhs. */
    maxfct  = 1;    /* Maximum number of numerical factorizations. */
    mnum    = 1;    /* Which factorization to use. */
    msglvl  = 1;    /* Print statistical information */
    error   = 0;    /* Initialize error flag */
    err_mem = 0;    /* Initialize error flag for memory allocation */

    /* Initialize matrix and rhs components on each process:
       in this code the initial matrix is distributed between 4 processes,
       so for MPI processes with rank > 3 the input domains are empty. */
    MKL_INT64 ii, ia_index, ja_index, up_r, down_r;
    FILE * inputFile;
    if (rank == 0)
    {
        //mkl_set_num_threads(2);
        inputFile = fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_1.txt", "r");
        fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);
        iparm[40] = 1;     /* Row of the global matrix/rhs/solution that begins the input domain of this MPI process */
        iparm[41] = n/4;   /* Row of the global matrix/rhs/solution that ends the input domain of this MPI process */
        //printf("%d %d %d %d", ia_index, ja_index, up_r, down_r);
        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a  = (double*)    MKL_malloc (sizeof (double)    * ja_index, 64);
        x  = (double*)    MKL_malloc (sizeof (double)    * n, 64);
        b  = (double*)    MKL_malloc (sizeof (double)    * n, 64);
        MPI_REDUCE_AND_BCAST;
        for (ii=0; ii<ia_index; ii++) fscanf(inputFile, "%lld", &ia[ii]);
        for (ii=0; ii<ja_index; ii++) fscanf(inputFile, "%lld", &ja[ii]);
        for (ii=0; ii<ja_index; ii++) fscanf(inputFile, "%lf",  &a[ii]);
        for (ii=0; ii<n; ii++) b[ii] = 1.0;
        fclose(inputFile);
        //printf("%d %d %d %d %lf %lf", ia[0], ia[ia_index-1], ja[0], ja[ja_index-1], a[0], a[ja_index-1]);
        //printf("%d %d", iparm[40], iparm[41]);
        //printf("%lf %lf %lf %lf",b[0],b[1],b[ia_index-3],b[ia_index-1]);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 0 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }
    else if (rank == 1)
    {
        //mkl_set_num_threads(2);
        inputFile = fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_2.txt", "r");
        fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);
        iparm[40] = (n/4)+1;   /* First row of the input domain of this MPI process */
        iparm[41] = (n/4)*2;   /* Last row of the input domain of this MPI process */
        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a  = (double*)    MKL_malloc (sizeof (double)    * ja_index, 64);
        x  = (double*)    MKL_malloc (sizeof (double)    * n, 64);
        b  = (double*)    MKL_malloc (sizeof (double)    * n, 64);
        MPI_REDUCE_AND_BCAST;
        for (ii=0; ii<ia_index; ii++) fscanf(inputFile, "%lld", &ia[ii]);
        for (ii=0; ii<ja_index; ii++) fscanf(inputFile, "%lld", &ja[ii]);
        for (ii=0; ii<ja_index; ii++) fscanf(inputFile, "%lf",  &a[ii]);
        fclose(inputFile);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 1 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }
    else if (rank == 2)
    {
        //mkl_set_num_threads(2);
        inputFile = fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_3.txt", "r");
        fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);
        iparm[40] = (n/4)*2+1; /* First row of the input domain of this MPI process */
        iparm[41] = (n/4)*3;   /* Last row of the input domain of this MPI process */
        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a  = (double*)    MKL_malloc (sizeof (double)    * ja_index, 64);
        x  = (double*)    MKL_malloc (sizeof (double)    * n, 64);
        b  = (double*)    MKL_malloc (sizeof (double)    * n, 64);
        MPI_REDUCE_AND_BCAST;
        for (ii=0; ii<ia_index; ii++) fscanf(inputFile, "%lld", &ia[ii]);
        for (ii=0; ii<ja_index; ii++) fscanf(inputFile, "%lld", &ja[ii]);
        for (ii=0; ii<ja_index; ii++) fscanf(inputFile, "%lf",  &a[ii]);
        fclose(inputFile);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 2 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }
    else if (rank == 3)
    {
        //mkl_set_num_threads(2);
        inputFile = fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_4.txt", "r");
        fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);
        iparm[40] = (n/4)*3+1; /* First row of the input domain of this MPI process */
        iparm[41] = (n/4)*4;   /* Last row of the input domain of this MPI process */
        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a  = (double*)    MKL_malloc (sizeof (double)    * ja_index, 64);
        x  = (double*)    MKL_malloc (sizeof (double)    * n, 64);
        b  = (double*)    MKL_malloc (sizeof (double)    * n, 64);
        MPI_REDUCE_AND_BCAST;
        for (ii=0; ii<ia_index; ii++) fscanf(inputFile, "%lld", &ia[ii]);
        for (ii=0; ii<ja_index; ii++) fscanf(inputFile, "%lld", &ja[ii]);
        for (ii=0; ii<ja_index; ii++) fscanf(inputFile, "%lf",  &a[ii]);
        fclose(inputFile);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 3 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }
    else
    {
        MPI_REDUCE_AND_BCAST;
        /* MPI processes with rank > 3 don't have an input domain,
           so iparm[40] needs to be greater than iparm[41]. */
        iparm[40] = 2;
        iparm[41] = 1;
    }

    //***************************************************************************************
    gethostname(hostName, 1023);
    printf("\n%d th rank at the end of the loading phase, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    if (rank == 0) before = clock();

    /* -------------------------------------------------------------------- */
    /* .. Reordering and Symbolic Factorization. This step also allocates   */
    /*    all memory that is necessary for the factorization.               */
    /* -------------------------------------------------------------------- */
    phase = 11;
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase, &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during symbolic factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 1;
    }
    if ( rank == 0 ) printf ("\nReordering completed ... ");
    printf("\n%d th rank at the end of the phase 11, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
    printf("\n%d th rank at the end of the phase 11, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    if (rank == 0)
    {
        result = (double)(clock()-before)/CLOCKS_PER_SEC;
        printf("##### Time consumption for reordering is %7.2lf seconds.\n", result);
        before = clock();
    }

    //***************************************************************************************
    /* -------------------------------------------------------------------- */
    /* .. Numerical factorization.                                          */
    /* -------------------------------------------------------------------- */
    phase = 22;
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase, &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during numerical factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 2;
    }
    if ( rank == 0 ) printf ("\nFactorization completed ... ");
    printf("\n%d th rank at the end of the phase 22, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
    printf("\n%d th rank at the end of the phase 22, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    if (rank == 0)
    {
        result = (double)(clock()-before)/CLOCKS_PER_SEC;
        printf("##### Time consumption for factorization is %7.2lf seconds.\n", result);
        before = clock();
    }

    //***************************************************************************************
    /* -------------------------------------------------------------------- */
    /* .. Back substitution and iterative refinement.                       */
    /* -------------------------------------------------------------------- */
    phase = 33;
    if ( rank == 0 ) printf ("\nSolving system...");
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase, &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, b, x, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during solution: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 4;
    }
    /* The solution of the system is distributed between MPI processes in the
       same way as the input matrix, so each of ranks 0-3 keeps only its part
       of the solution. */
    printf("\n%d th rank at the end of the program, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
    printf("\n%d th rank at the end of the program, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    if (rank == 0)
    {
        result = (double)(clock()-before)/CLOCKS_PER_SEC;
        printf("##### Time consumption for solving is %7.2lf seconds.\n", result);
    }

    //***************************************************************************************
    if ( rank == 0 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = 0; j < 10; j++ )
        {
            printf ("\n on zero process x [%lli] = % f", (long long int)j, x[j]);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if ( rank == 1 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = n-10; j < n; j++ )
        {
            printf ("\n on first process x [%lli] = % f", (long long int)j, x[j]);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

    //double res, res0;
    //char* uplo;
    //uplo = "Upper-triangle";
    //mkl_cspblas_scsrsymv ( uplo, &n, a, ia, ja, x, bs );
    //res = 0.0;
    //res0 = 0.0;
    //printf ("%lf %lf %lf %lf \n", b[0], b[1], b[2], b[3]);
    //printf ("%lf %lf %lf %lf \n", bs[0], bs[1], bs[2], bs[3]);
    //printf("XXXX");

    /* -------------------------------------------------------------------- */
    /* .. Termination and release of memory.                                */
    /* -------------------------------------------------------------------- */
    phase = -1; /* Release internal memory. */
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase, &n, &ddum, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    //printf ("%lf %lf %lf %lf \n", b[0], b[1], b[2], b[3]);
    //printf ("%lf %lf %lf %lf \n", bs[0], bs[1], bs[2], bs[3]);
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during release memory: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 5;
    }
    if ( rank < size )
    {
        MKL_free(ia);
        MKL_free(ja);
        MKL_free(a);
        MKL_free(x);
        MKL_free(b);
    }
    printf("\n%d th rank at the end of the phase 33, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
    //gethostname(hostName, 1023);
    //printf("\n%d th rank hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
    //printf("\nPeak memory allocated by Intel MKL memory allocator after reset of peak memory counter %ld bytes\n", mkl_peak_mem_usage(MKL_PEAK_MEM));
    mpi_stat = MPI_Finalize();
    return 0;
}