MKL does not respect affinity on master thread

This is a follow-up (and semi-related) post to this rather long post:

https://software.intel.com/en-us/comment/1825717#comment-1825717

So like the title says:

MKL does not respect affinity on the master thread. Even though I force MKL to spawn its threads on the NUMA node the master thread is running on, only one thread runs on this NUMA node and the rest get pushed to other NUMA nodes. This is only a problem for the master thread, since all the other threads actually respect the affinity they have been given.

The code below is made to run on systems with a minimum of 6 NUMA nodes with at least 6 cores on each node. (In my case I run it on a system with 8 NUMA nodes with 6 cores on each.)

It is very clear to see what happens if you comment/uncomment case (1) in the first nested region, i.e. this line:

       CASE(1)
          !call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C1,dim)<--- comment out this one

This bug or whatever it is, is extremely problematic for us, since it in practice makes it impossible to use nested MKL on NUMA systems. Does anyone have a solution to it? Or does Intel have an idea about when this could be resolved?

program NumaAwareDGEMM
 ! Reproducer for thread placement of nested MKL dgemm calls on a NUMA machine.
 !
 ! An outer OpenMP team of NoNUMANodes threads is started. Each outer thread
 ! calls dgemm on the shared inputs A and B and writes to its own result
 ! matrix (C1..C8), while MKL is asked to use 5 threads per call. The round of
 ! dgemm calls is executed twice.
 !
 ! The OMP_PLACES strings used below describe groups of 6 consecutive logical
 ! CPUs (0-5, 6-11, ...); presumably each group is one NUMA node on the target
 ! machine - verify against the actual topology.

 use IFPORT
 use omp_lib
 use mkl_service
 implicit none

 ! Same kind as the 1.d0 / 0.d0 literals handed to dgemm below.
 integer, parameter :: dp = kind(1.d0)

 logical(4) :: Success
 integer :: NoNUMANodes, blocksize,dim
 integer :: I
 integer :: ID
 real(dp),allocatable,dimension(:,:) :: A, B,C1,c2,c3,c4,c5,c6,c7,c8


NoNUMANodes=6                     !How many NUMA nodes to distribute calculations over
 ! NOTE(review): the OpenMP specification states that changes to OMP_*
 ! environment variables made after program start are ignored by the runtime,
 ! so whether these two calls take effect is implementation dependent.
 ! Confirm with the OMP_DISPLAY_ENV output, or export them before launching.
success = SETENVQQ("OMP_DISPLAY_ENV=TRUE")
success=SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6}")


 blocksize=1000
 dim=blocksize*NoNUMANodes
 allocate(A(dim,dim))
 allocate(B(dim,dim))
 allocate(C1(dim,dim))
 allocate(C2(dim,dim))
 allocate(C3(dim,dim))
 allocate(C4(dim,dim))
 allocate(C5(dim,dim))
 allocate(C6(dim,dim))
 ! C7 and C8 are only referenced by CASE(7)/CASE(8), i.e. when NoNUMANodes is
 ! raised to 8.
 allocate(C7(dim,dim))
 allocate(C8(dim,dim))

 ! dgemm reads A and B, so they must be defined before the first call.
 ! (The C matrices are pure outputs because beta = 0.d0.)
 A = 1.d0
 B = 1.d0

 call KMP_SET_STACKSIZE_S(990000000)
 ! omp_set_dynamic and omp_set_nested take LOGICAL arguments.
 call omp_set_dynamic(.false.)     ! do not let the runtime shrink team sizes
 call mkl_set_dynamic(0)           ! same for MKL's own thread count
 call omp_set_nested(.true.)       ! allow MKL's team inside the outer team
 call MKL_SET_NUM_THREADS(5)

   !intialization region
   call omp_set_num_threads(NoNumaNodes) !Outer parallelization across all numanodes
   ! success is PRIVATE here: every thread assigns it, so sharing it would be
   ! a data race.
   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID,success)
   !$OMP DO SCHEDULE(STATIC)
   do i = 1,NoNumanodes
      ID=omp_get_thread_num()
      print *,'Thread binding for socket=',ID
      ! omp_get_thread_num() is 0-based and the master thread is 0, so the
      ! cases start at 0. Starting at 1 would leave the master thread without
      ! any setting and shift every other thread by one node.
      !
      ! The environment is process-wide state, so the updates are serialized.
      ! NOTE(review): because it is process-wide, the last writer wins and
      ! this is not a per-thread setting. Per-level binding is normally
      ! requested with OMP_PROC_BIND (e.g. "spread,close") before launch -
      ! confirm whether that fits this experiment.
      !$OMP CRITICAL (env_update)
      SELECT CASE (ID)
        CASE(0)
          success=SETENVQQ("OMP_PLACES={0:6}")
        CASE(1)
          success=SETENVQQ("OMP_PLACES={6:6}")
        CASE(2)
          success=SETENVQQ("OMP_PLACES={12:6}")
        CASE(3)
          success=SETENVQQ("OMP_PLACES={18:6}")
        CASE(4)
          success=SETENVQQ("OMP_PLACES={24:6}")
        CASE(5)
          success=SETENVQQ("OMP_PLACES={30:6}")
        CASE(6)
          success=SETENVQQ("OMP_PLACES={36:6}")
        CASE(7)
          success=SETENVQQ("OMP_PLACES={42:6}")
      END SELECT
      !$OMP END CRITICAL (env_update)
   end do
   !$OMP END DO
   !$OMP END PARALLEL
    print*,'Initialization over'

   ! First nested region: iteration i multiplies A*B into its own C matrix.
   ! With SCHEDULE(STATIC) and as many iterations as threads, each outer
   ! thread executes exactly one dgemm call.
   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i)
   !$OMP DO SCHEDULE(STATIC)
   do i = 1,NoNumanodes
      SELECT CASE (i)
        CASE(1)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C1,dim)
        CASE(2)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C2,dim)
        CASE(3)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C3,dim)
        CASE(4)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C4,dim)
        CASE(5)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C5,dim)
        CASE(6)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C6,dim)
        CASE(7)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C7,dim)
        CASE(8)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C8,dim)
      END SELECT
   end do
   !$OMP END DO
   !$OMP END PARALLEL
   print*,'First MKL call done'

   ! Second nested region: the same round of dgemm calls repeated.
   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i)
   !$OMP DO SCHEDULE(STATIC)
   do i = 1,NoNumanodes
      SELECT CASE (i)
        CASE(1)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C1,dim)
        CASE(2)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C2,dim)
        CASE(3)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C3,dim)
        CASE(4)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C4,dim)
        CASE(5)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C5,dim)
        CASE(6)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C6,dim)
        CASE(7)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C7,dim)
        CASE(8)
          call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C8,dim)
      END SELECT
   end do
   !$OMP END DO
   !$OMP END PARALLEL
end program NumaAwareDGEMM

dasfasd

Latest Images

Trending Articles

Latest Images