
Large sparse matrix solving problem with cluster


Dear Intel

With your help a month ago, I was able to set up the 'cluster_sparse_solver_64' program using iparm[1]=10 (the MPI version of the nested dissection and symbolic factorization algorithms) on my 4-node cluster.
The program is intended for very large sparse matrices with about 10^8 - 10^9 rows.
However, the cluster run (4 MPI & 4 OpenMP) shows lower performance than a single machine (1 MPI & 4 OpenMP).

The following table shows the results of the test.

  Target: 4*10^8-row matrix

  Time consumption      1 MPI & 4 OpenMP    4 MPI & 4 OpenMP (iparm[1]=3)    4 MPI & 4 OpenMP (iparm[1]=10)
  ----------------------------------------------------------------------------------------------------------
  Reorder time                1021.6 s               2094.3 s                         7644.3 s
  Factorization time          1403.2 s               2136.6 s                         9263.1 s
  Solution time                158.6 s                684.9 s                          554.14 s
  ----------------------------------------------------------------------------------------------------------
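
For clarity, the only setting that distinguishes the two cluster columns is the reordering algorithm chosen through iparm[1] in the code below; a minimal sketch of that switch (values as documented for this parameter):

    MKL_INT64 iparm[64] = { 0 };
    iparm[0] = 1;   /* do not use solver defaults; take values from iparm */
    iparm[1] = 10;  /* 10 = MPI version of nested dissection and symbolic factorization;
                       set to 3 for the parallel (OpenMP) nested dissection run in the table */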

Could you please look into this issue?
I attach my code below.

Thank you very much in advance!!!

Regards,
 Yong-hee

P.S. This is my code (almost the same as the example code).
=================================================================================

/*******************************************************************************
* Copyright 2004-2015 Intel Corporation All Rights Reserved.
*
* The source code,  information  and material  ("Material") contained  herein is
* owned by Intel Corporation or its  suppliers or licensors,  and  title to such
* Material remains with Intel  Corporation or its  suppliers or  licensors.  The
* Material  contains  proprietary  information  of  Intel or  its suppliers  and
* licensors.  The Material is protected by  worldwide copyright  laws and treaty
* provisions.  No part  of  the  Material   may  be  used,  copied,  reproduced,
* modified, published,  uploaded, posted, transmitted,  distributed or disclosed
* in any way without Intel's prior express written permission.  No license under
* any patent,  copyright or other  intellectual property rights  in the Material
* is granted to  or  conferred  upon  you,  either   expressly,  by implication,
* inducement,  estoppel  or  otherwise.  Any  license   under such  intellectual
* property rights must be express and approved by Intel in writing.
*
* Unless otherwise agreed by Intel in writing,  you may not remove or alter this
* notice or  any  other  notice   embedded  in  Materials  by  Intel  or Intel's
* suppliers or licensors in any way.
*******************************************************************************/

/*
*
*   MKL Cluster Sparse Solver example demonstrating the case when the initial data (matrix
*   and rhs) is distributed between several MPI processes, and the final solution is
*   distributed between the MPI processes in the same way as they hold the initial data.
*
********************************************************************************
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <time.h>
#include "mpi.h"
#include "mkl.h"
#include "mkl_cluster_sparse_solver.h"

#ifdef MKL_ILP64
#define MPI_DT MPI_LONG
#else
#define MPI_DT MPI_INT
#endif
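/* Sum each rank's allocation-error flag on rank 0 and broadcast the result,
   so every MPI process learns whether any allocation failed anywhere. */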
#define MPI_REDUCE_AND_BCAST \
        MPI_Reduce(&err_mem, &error, 1, MPI_DT, MPI_SUM, 0, MPI_COMM_WORLD); \
        MPI_Bcast(&error, 1, MPI_DT, 0, MPI_COMM_WORLD);

int main(void)

{
	clock_t before;
	double result;

	MKL_INT64 AllocatedBytes;

	int N_AllocatedBuffers;
	mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
	AllocatedBytes = mkl_mem_stat(&N_AllocatedBuffers);

	mkl_set_num_threads(2);
	FILE* pre_inputFile=fopen("/user/source/CREATE_MATRIX/SPM_test_20000_whole.txt", "r");
	if (pre_inputFile==NULL){
		puts( "There is no file." );
		return 1;
	}

	MKL_INT64 n;
	fscanf(pre_inputFile, "%lld", &n);
	n -= 1;
    fclose(pre_inputFile);

    /* Matrix data. */
	//MKL_INT64 n = 5;

    MKL_INT64 mtype = 2; /* Real symmetric positive definite matrix */
    MKL_INT64 *ia = NULL;
    MKL_INT64 *ja = NULL;
    double  *a = NULL;
    /* RHS and solution vectors. */
    double  *b = NULL;
    double  *x = NULL;

	char	hostName[1024]	= "\0";

    MKL_INT64 nrhs = 1; /* Number of right hand sides. */
    /* Internal solver memory pointer pt, */
    /* 32-bit: int pt[64]; 64-bit: long int pt[64] */
    /* or void *pt[64] should be OK on both architectures */
    void *pt[64] = { 0 };
    /* Cluster Sparse Solver control parameters. */
    MKL_INT64 iparm[64] = { 0 };
    MKL_INT64 maxfct, mnum, phase, msglvl, error, err_mem;

    /* Auxiliary variables. */
    double  ddum; /* Double dummy   */
    MKL_INT64 idum; /* Integer dummy. */
    MKL_INT64 j;
    int     mpi_stat = 0;
    int     argc = 0;
    int     comm, rank, size;
    char**  argv = NULL;

	printf("at the beginning of the program, peak memory : %ld bytes\n", mkl_peak_mem_usage(MKL_PEAK_MEM));

	/* -------------------------------------------------------------------- */
    /* .. Init MPI.                                                         */
    /* -------------------------------------------------------------------- */
    mpi_stat = MPI_Init( &argc, &argv );
    mpi_stat = MPI_Comm_rank( MPI_COMM_WORLD, &rank );
	mpi_stat = MPI_Comm_size( MPI_COMM_WORLD, &size );
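    /* cluster_sparse_solver expects the communicator as a Fortran MPI handle,
       hence the MPI_Comm_c2f conversion below. */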
    comm =  MPI_Comm_c2f( MPI_COMM_WORLD );
	//printf ("comm : %d, rank : %d, size : %d", comm, rank, size);

	if( size < 4 )
	{
		printf("\nERROR: this program needs at least 4 MPI processes because the matrix is split into four input domains");
		mpi_stat = MPI_Finalize();
		return 1;
	}

    /* -------------------------------------------------------------------- */
    /* .. Setup Cluster Sparse Solver control parameters.                                 */
    /* -------------------------------------------------------------------- */
    iparm[ 0] =  1; /* Solver default parameters are overridden with the values provided in iparm */
    iparm[ 1] =  10; /* MPI version of the nested dissection and symbolic factorization algorithms */
    iparm[ 5] =  0; /* Write solution into x */
    iparm[ 7] =  2; /* Max number of iterative refinement steps */
    iparm[ 9] = 13; /* Perturb the pivot elements with 1E-13 */
    iparm[10] =  0; /* Don't use nonsymmetric permutation and scaling MPS */
    iparm[12] =  1; /* Switch on Maximum Weighted Matching algorithm (default for non-symmetric) */
    //iparm[17] = -1; /* Output: Number of nonzeros in the factor LU */
    //iparm[18] = -1; /* Output: Mflops for LU factorization */
    //iparm[26] =  1; /* Check input data for correctness */
	//iparm[34] =  1; /* Cluster Sparse Solver use C-style indexing for ia and ja arrays */
    //iparm[39] =  2; /* Input: matrix/rhs/solution are distributed between MPI processes  */
    /* If iparm[39]=2, the matrix is provided in distributed assembled matrix input
       format. In this case, each MPI process stores only a part (or domain) of the matrix A
       data. The bounds of the domain should be set via iparm(41) and iparm(42). Solution
       vector is distributed between process in same manner with rhs. */

    maxfct = 1; /* Maximum number of numerical factorizations. */
    mnum   = 1; /* Which factorization to use. */
    msglvl = 1; /* Print statistical information */
    error  = 0; /* Initialize error flag */
	err_mem = 0; /* Initialize error flag for memory allocation */

    /* Initialize matrix and rhs components on each process:
       In this example the initial matrix is distributed between 4 processes,
       so for MPI processes with rank > 3 the input domains are empty */

	MKL_INT64 ii, ia_index, ja_index, up_r, down_r;
	FILE * inputFile;

    if (rank == 0)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_1.txt", "r");

		fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);

		iparm[40] = 1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = n/4; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

		//printf("%d %d %d %d", ia_index, ja_index, up_r, down_r);

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii<ia_index; ii++)
			fscanf(inputFile, "%lld", &ia[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lld", &ja[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lf", &a[ii]);
		for (ii=0; ii<n; ii++)
			b[ii] = 1.0;

	    fclose(inputFile);
		//printf("%d %d %d %d %lf %lf", ia[0], ia[ia_index-1], ja[0], ja[ja_index-1], a[0], a[ja_index-1]);
		//printf("%d %d", iparm[40], iparm[41]);
		//printf("%lf %lf %lf %lf",b[0],b[1],b[ia_index-3],b[ia_index-1]);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 0 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else if (rank == 1)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_2.txt", "r");

		fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);

		iparm[40] = (n/4)+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*2; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii<ia_index; ii++)
			fscanf(inputFile, "%lld", &ia[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lld", &ja[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lf", &a[ii]);

	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 1 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else if (rank == 2)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_3.txt", "r");

		fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);

		iparm[40] = (n/4)*2+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*3; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii<ia_index; ii++)
			fscanf(inputFile, "%lld", &ia[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lld", &ja[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lf", &a[ii]);

	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 2 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else if (rank == 3)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_4.txt", "r");

		fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);

		iparm[40] = (n/4)*3+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*4; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii<ia_index; ii++)
			fscanf(inputFile, "%lld", &ia[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lld", &ja[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lf", &a[ii]);

	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 3 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else {
		MPI_REDUCE_AND_BCAST;
        /* In this example MPI processes with rank > 3 don't have an input domain,
           so iparm[40] needs to be greater than iparm[41] */
        iparm[40] = 2;
        iparm[41] = 1;
    }

	//***************************************************************************************************************************************************

	gethostname(hostName, 1023);
    printf("\n%d th rank at the end of the loading phase, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
		before = clock();
	/* -------------------------------------------------------------------- */
    /* .. Reordering and Symbolic Factorization. This step also allocates   */
    /* all memory that is necessary for the factorization.                  */
    /* -------------------------------------------------------------------- */
    phase = 11;
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase,&n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during symbolic factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 1;
    }

    if ( rank == 0 ) printf ("\nReordering completed ... ");

	printf("\n%d th rank at the end of the phase 11, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the phase 11, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for reordering is %7.2lf seconds.\n", result);
		before = clock();
	}

	//***************************************************************************************************************************************************
    /* -------------------------------------------------------------------- */
    /* .. Numerical factorization.                                          */
    /* -------------------------------------------------------------------- */
    phase = 22;
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase,&n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during numerical factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 2;
    }
    if ( rank == 0 ) printf ("\nFactorization completed ... ");

	printf("\n%d th rank at the end of the phase 22, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the phase 22, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for factorization is %7.2lf seconds.\n", result);
		before = clock();
	}

	//***************************************************************************************************************************************************
    /* -------------------------------------------------------------------- */
    /* .. Back substitution and iterative refinement.                       */
    /* -------------------------------------------------------------------- */
    phase = 33;

    if ( rank == 0 ) printf ("\nSolving system...");
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase,&n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, b, x, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during solution: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 4;
    }
    /* The solution of the system is distributed between the MPI processes in the same way
       as the input matrix, so each process keeps only its own part of the solution */

	printf("\n%d th rank at the end of the program, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the program, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for solving is %7.2lf seconds.\n", result);
	}

	//***************************************************************************************************************************************************
    if ( rank == 0 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = 0; j < 10; j++ )
        {
            printf ("\n on zero process x [%lli] = % f", (long long int)j, x[j]);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if ( rank == 1 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = n-10; j < n; j++ )
        {
            printf ("\n on first process x [%lli] = % f", (long long int)j, x[j]);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

	//double res, res0;
    //char*   uplo;
	//uplo = "Upper-triangle";
    //mkl_cspblas_scsrsymv ( uplo, &n, a, ia, ja, x, bs );
    //res  = 0.0;
    //res0 = 0.0;

	//printf ("%lf %lf %lf %lf \n", b[0], b[1], b[2], b[3]);
	//printf ("%lf %lf %lf %lf \n", bs[0], bs[1], bs[2], bs[3]);
	//printf("XXXX");

    /* -------------------------------------------------------------------- */
    /* .. Termination and release of memory. */
    /* -------------------------------------------------------------------- */
    phase = -1; /* Release internal memory. */
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase,&n, &ddum, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );

	//printf ("%lf %lf %lf %lf \n", b[0], b[1], b[2], b[3]);
	//printf ("%lf %lf %lf %lf \n", bs[0], bs[1], bs[2], bs[3]);

    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during release memory: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 5;
    }
    if ( rank < size )
    {
        MKL_free(ia);
        MKL_free(ja);
        MKL_free(a);
        MKL_free(x);
        MKL_free(b);
    }

	printf("\n%d th rank at the end of the phase 33, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);

	//gethostname(hostName, 1023);
	//printf("\n%d th rank hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	//printf("\nPeak memory allocated by Intel MKL memory allocator after reset of peak memory counter %ld bytes\n", mkl_peak_mem_usage(MKL_PEAK_MEM));

    mpi_stat = MPI_Finalize();
    return 0;
}

 

