Directly calling MKL from Python and trying to use more than one thread.

I am doing some sparse matrix calculations and calling MKL directly from Python.

That works, but only a single thread is used. When I run the top command, one CPU core shows 100% usage while the other CPU cores show about 0% usage.

How can I make the MKL function use multiple threads?

I have tried setting the OMP_NUM_THREADS, MKL_NUM_THREADS, and MKL_DOMAIN_NUM_THREADS environment variables to 12.

The code also tries to set the number of MKL threads to 12 with mkl.mkl_set_num_threads(byref(c_int(num_cpu))).

Do the sparse matrix routines of MKL support multithreaded calculation?

The MKL version is 2016.

Thank you.

The code is below:

from ctypes import *
import scipy.sparse as spsp
import numpy as np
import multiprocessing as mp

# Load the MKL runtime shared library; every MKL routine used below is
# called through this ctypes handle.
mkl = cdll.LoadLibrary("libmkl_rt.so")


def get_csr_handle2(data, indices, indptr, shape):
	"""Bundle the three CSR arrays into the pointer tuple passed to MKL.

	Parameters:
		data    -- numpy array of matrix values; must be np.float32.
		indices -- numpy array of column indices; must match C ``int``.
		indptr  -- numpy array of row pointers; must match C ``int``.
		shape   -- (rows, cols) tuple, passed through unchanged.

	Returns:
		(data_ptr, indices_ptr, indptr_ptr, shape), where the pointers are
		ctypes ``float*`` / ``int*`` views of the arrays' own buffers
		(nothing is copied).

	Raises:
		TypeError  -- if an array's dtype does not match the pointer type.
		ValueError -- if an array is not C-contiguous.
	"""
	expected = (
		("data", data, np.float32),
		("indices", indices, np.intc),
		("indptr", indptr, np.intc),
	)
	for label, array, dtype in expected:
		# The casts below only reinterpret memory; a mismatching dtype or a
		# strided view would hand silently corrupted values to the C side.
		if array.dtype != np.dtype(dtype):
			raise TypeError(
				"%s must have dtype %s, got %s"
				% (label, np.dtype(dtype).name, array.dtype))
		if not array.flags.c_contiguous:
			raise ValueError("%s must be a C-contiguous array" % label)

	a_pointer = data.ctypes.data_as(POINTER(c_float))
	ja_pointer = indices.ctypes.data_as(POINTER(c_int))
	ia_pointer = indptr.ctypes.data_as(POINTER(c_int))
	return (a_pointer, ja_pointer, ia_pointer, shape)


def get_csr_handle(A, clear=False):
	"""Return the MKL pointer handle for the sparse matrix ``A``.

	Parameters:
		A     -- object exposing ``data``, ``indices``, ``indptr`` and
		         ``shape`` (e.g. a scipy.sparse CSR matrix).
		clear -- when true, zero all three arrays of ``A`` IN PLACE before
		         building the handle, so ``A`` can serve as an output buffer.

	Returns:
		The tuple produced by ``get_csr_handle2`` for A's arrays and shape.
	"""
	if clear:
		# Slice assignment keeps the existing buffers (and therefore the
		# addresses the returned pointers refer to) and only resets contents.
		for buf in (A.indptr, A.indices, A.data):
			buf[:] = 0
	return get_csr_handle2(A.data, A.indices, A.indptr, A.shape)


def csr_t_dot_csr(A_handle, C_handle, nz=None):
	"""Calculate (A.T).dot(A) with ``mkl_scsrmultcsr`` and store it in C.

	Both handles are ``(data_ptr, indices_ptr, indptr_ptr, shape)`` tuples
	such as those returned by ``get_csr_handle``.

	Requirements on the caller:
	  * A uses one-based indexing (indices and indptr already shifted by 1).
	  * Both A.data and C.data are np.float32.
	  * C.data and C.indices can hold at least as many entries as the result
	    has nonzero elements (at least ``nz`` entries).
	  * With A of shape (m, n) the product is n x n, so C.indptr needs at
	    least n + 1 entries.

	Parameters:
		A_handle -- handle of the input matrix A (used as both operands).
		C_handle -- handle of the pre-allocated output matrix C.
		nz       -- capacity of C's data/indices arrays (MKL's ``nzmax``).
		            Defaults to n*n, the dense upper bound, limited to the
		            largest value a C ``int`` can hold.

	Returns:
		A one-element list ``[(ret, info)]``. ``info`` is the status value
		MKL wrote back. ``ret`` is whatever ctypes reports as the call's
		return value; NOTE(review): the routine is presumably ``void`` in
		MKL, in which case ``ret`` carries no meaning -- confirm.
	"""
	(a_pointer, ja_pointer, ia_pointer, A_shape) = A_handle
	(c_pointer, jc_pointer, ic_pointer, _) = C_handle
	(m, n) = A_shape

	if nz is None:
		# c_int performs no overflow checking, so an unclamped n*n would
		# silently wrap (possibly to a negative nzmax) for wide matrices.
		c_int_max = 2 ** (8 * sizeof(c_int) - 1) - 1
		nz = min(n * n, c_int_max)

	# Every scalar is passed by reference. The ctypes objects are bound to
	# local names so they stay alive for the duration of the call.
	trans = c_char(b'T')   # multiply by the transpose of A; bytes literal
	                       # works on both Python 2 and Python 3
	request = c_int(0)     # request=0: single call, nzmax is honoured
	sort = c_int(0)        # sort flag 0, as in the original code
	rows_a = c_int(m)      # number of rows of matrix A
	cols_a = c_int(n)      # number of columns of matrix A
	cols_b = c_int(n)      # number of columns of matrix B (B is A itself,
	                       # so this is n when trans='T')
	# nzmax is the length of arrays c and jc (data and indices of C). It is
	# used only if request=0: the routine stops calculation if the number of
	# elements in the result matrix C exceeds it.
	nzmax = c_int(nz)
	info = c_int(-3)       # sentinel, overwritten by MKL

	ret = mkl.mkl_scsrmultcsr(
		byref(trans), byref(request), byref(sort),
		byref(rows_a), byref(cols_a), byref(cols_b),
		a_pointer, ja_pointer, ia_pointer,
		a_pointer, ja_pointer, ia_pointer,   # B shares A's buffers
		c_pointer, jc_pointer, ic_pointer,
		byref(nzmax), byref(info))
	return [(ret, info.value)]

def test(num_cpu=12):
	"""Ask MKL for ``num_cpu`` threads, report its maximum, run the benchmark.

	Parameters:
		num_cpu -- number of MKL threads to request (default 12).

	This call does not return: ``test_csr_t_dot_csr`` loops forever.
	"""
	# try to set number of mkl threads (the count is passed by reference)
	mkl.mkl_set_num_threads(byref(c_int(num_cpu)))
	# Formatting into a single string prints the same text on Python 2 and 3.
	print("mkl get max thread: %s" % (mkl.mkl_get_max_threads(),))
	test_csr_t_dot_csr()

def test_csr_t_dot_csr():
	"""Repeatedly run ``csr_t_dot_csr`` on a random sparse matrix.

	Builds a random 12 x 750000 0/1 matrix (about 1% ones), hands its
	transpose to MKL in one-based CSR form and multiplies into a 12 x 12
	output buffer. The multiplication is repeated in an endless loop so the
	CPU usage can be observed from outside; this function never returns.
	"""
	AA = np.random.choice([0, 1], size=(12, 750000), replace=True, p=[0.99, 0.01])
	A_original = spsp.csr_matrix(AA)
	A_csc = A_original.astype(np.float32).tocsc()
	# Reading CSC buffers as CSR yields the transposed matrix. The shape is
	# given explicitly so it does not depend on which indices happen to occur.
	A = spsp.csr_matrix((A_csc.data, A_csc.indices, A_csc.indptr),
	                    shape=A_csc.shape[::-1])

	A.indptr += 1   # convert to 1-based indexing
	A.indices += 1  # convert to 1-based indexing
	A_ptrs = get_csr_handle(A)

	# Fully populated 12 x 12 matrix, zeroed and reused as the output buffer.
	C = spsp.csr_matrix(np.ones((12, 12)), dtype=np.float32)
	C_ptrs = get_csr_handle(C, clear=True)

	print("=call mkl function=")

	while True:
		csr_t_dot_csr(A_ptrs, C_ptrs)

# Run the threading experiment when executed as a script. The call does not
# return, because the benchmark it starts loops forever.
if __name__ == "__main__":
	test()

So far, NumPy linked with MKL can use multiple threads when running the following code, without any environment variables being set.

import ctypes
# MKL is loaded and queried before numpy is imported; keep this order.
mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so")
# Parenthesised single argument prints identically on Python 2 and 3.
print(mkl.mkl_get_max_threads())
import numpy as np

a = np.random.normal(0, 1, (100, 1000))

# Endless dense matrix product, used to watch how many cores numpy keeps busy.
while True:
    a.dot(a.T)

Directly calling MKL from Python and trying to use more than one thread.

Trending Articles

Bath man appears in court charged with attempted murder of a man...

MACLEAN, Allan

Black Angus Grilled Artichokes

Practice Sheet of Right form of verbs for HSC Students

Police blotter for Jan. 12

99 God Status for Whatsapp, Facebook

Rajasthan Board 12th Science Result 2018 name wise- RBSE 12th commerce result...

Notorious Naushad of Ippa gang nabbed

Child Kidnapping: Amy McNeil was kidnapped on her way to school by 5 adults;...

Sonible Smartlimit v1.1.5-R2R

NCERT Solutions for Class 9th Sanskrit Chapter 3 पाथेयम्

मतलबी दोस्त स्टेट्स | Matlabi Dost Status in Hindi – Selfish Friends Status

Arrow Flash 2 – Sinhala Dubbed – Episode 23 – 20th March 2016

[GET] AI Traffic Goldmine

[E² Plugin] HDF-Radio

Universal Multi-Patch v1.3 By RADIXX11

IWAN – Thanks and Praise ( Throw Back Thursday )

RONALD P SONDERGAARD Arrested by Miami-Dade County Corrections on Mar 03, 2017

मुख मैथुन से उठाएं सेक्स का भरपूर मज़ा, जानें क्या है इसका सही तरीकामुख मैथुन...

HSSC Excise & Taxation Inspector Result 2017 Scorecard/ Category Wise Merit List