Fortran Example
We will use a Fortran example to illustrate how choosing memory allocations appropriately can avoid redundant data transfers and boost performance.
The Fortran example uses OpenMP offload and is adapted from NWChem, a high-performance computational chemistry application.
In the example, there is a 4-level loop nest. The innermost k-loop calls the omp_fbody routine which offloads two calls to sgemm onto the device, and then offloads a reduction computation (reduction variables emp4i and emp5i) onto the device.
In the loop nest, some arrays are updated on the host (namely, arrays Tkj, Kkj, Jia, Kia, Tia, Xia, and t1v1). So we need to make the values of these arrays on the device consistent with their values on the host.
Version 1
In the first (naive) version of the program, we allocate all 11 arrays in Shared memory using the allocate directive.
!$omp allocate allocator(omp_target_shared_mem_alloc) allocate( f1n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( f2n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( eorb(1:nbf) ) ...
Shared allocations are accessible by the host and the device, and automatically migrate between the host and the device as needed. So the same pointer to a Shared allocation may be used on the host and device.
Version 1 is shown below.
include "mkl_omp_offload.f90" subroutine omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) use omp_lib use onemkl_blas_omp_offload_lp64 use iso_fortran_env implicit none real, intent(inout) :: emp4,emp5 integer, intent(in) :: ncor,nocc,nvir integer, intent(in) :: a,i,j,k, klo real, intent(inout) :: f1n(nvir,nvir) real, intent(inout) :: f2n(nvir,nvir) real, intent(in) :: eorb(*) real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*) real, intent(in) :: Tkj(*), Kkj(*) real, intent(in) :: t1v1(nvir),t1v2(nvir) real :: emp4i,emp5i real :: eaijk,denom integer :: lnov,lnvv integer :: b,c real :: f1nbc,f1ncb,f2nbc,f2ncb real :: t1v1b,t1v2b lnov=nocc*nvir lnvv=nvir*nvir emp4i = 0.0 emp5i = 0.0 !$omp dispatch call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, & Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir) !$omp dispatch call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, & Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir) !$omp dispatch call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, & Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir) !$omp dispatch call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, & Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir) eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) ) !$omp target teams distribute parallel do collapse(2) & !$omp reduction(+:emp4i,emp5i) & !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) & !$omp private(t1v1b,t1v2b) & !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc) do b=1,nvir do c=1,nvir denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk ) f1nbc = f1n(b,c); f1ncb = f1n(c,b); f2nbc = f2n(b,c); f2ncb = f2n(c,b); t1v1b = t1v1(b); t1v2b = t1v2(b); emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb) emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb) enddo enddo !$omp end target teams distribute parallel do emp4 = emp4 + emp4i emp5 = emp5 + emp5i end subroutine init_array_1(arr, m) implicit none real, intent(inout) :: arr(m) integer m, i do i = 1, m arr(i) = 1.0/(100.0 + i-1) end do end subroutine init_array_1 subroutine init_array_2(arr, m, n) implicit none real, intent(inout) :: arr(m, n) integer m, n, i, j do i = 1, m do j = 1, n arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j) end do end do end subroutine init_array_2 program main use omp_lib use iso_fortran_env implicit none interface subroutine omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) real, intent(inout) :: emp4,emp5 integer, intent(in) :: ncor,nocc,nvir integer, intent(in) :: a,i,j,k, klo real, intent(inout) :: f1n(nvir,nvir) real, intent(inout) :: f2n(nvir,nvir) real, intent(in) :: eorb(*) real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*) real, intent(in) :: t1v1(nvir),t1v2(nvir) end subroutine omp_fbody end interface integer :: ncor, nocc, nvir, maxiter, nkpass integer :: nbf, lnvv, lnov, kchunk real, allocatable :: eorb(:) real, allocatable :: f1n(:,:) real, allocatable :: f2n(:,:) real, allocatable :: Jia(:) real, allocatable :: Kia(:) real, allocatable :: Tia(:) real, allocatable :: Xia(:) real, allocatable :: Tkj(:) real, allocatable :: Kkj(:) real, allocatable :: t1v1(:),t1v2(:) real emp4, emp5 integer :: a, b, c, i, j, k integer :: klo, khi, iter double precision, allocatable :: timers(:) double precision :: t0, t1, tsum, tmax, tmin, tavg ! Run parameters nocc = 256 nvir = 2048 maxiter = 50 nkpass = 1 ncor = 0 print *, "Run parameters:" print *, "nocc =", nocc print *, "nvir =", nvir print *, "maxiter =", maxiter print *, "nkpass =", nkpass print *, "ncor =", ncor print *, " " ! Allocate and initialize arrays. nbf = ncor + nocc + nvir lnvv = nvir * nvir lnov = nocc * nvir kchunk = (nocc - 1)/nkpass + 1 !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( f1n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( f2n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( eorb(1:nbf) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( Jia(1:lnvv) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( Kia(1:lnvv) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( Tia(1:lnov*nocc) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( Xia(1:lnov*nocc)) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( Tkj(1:kchunk*lnvv) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( Kkj(1:kchunk*lnvv) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( t1v1(1:lnvv) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( t1v2(1:lnvv) ) ! call init_array_1(eorb, nbf) call init_array_1(Jia, lnvv) call init_array_1(Kia, lnvv) call init_array_1(Tia, lnov*nocc) call init_array_1(Xia, lnov*nocc) call init_array_1(Tkj, kchunk*lnvv) call init_array_1(Kkj, kchunk*lnov) call init_array_1(t1v1, lnvv) call init_array_1(t1v2, lnvv) call init_array_2(f1n, nvir, nvir) call init_array_2(f2n, nvir, nvir) print *, "End of initialization" allocate (timers(1:maxiter)) emp4=0.0 emp5=0.0 a=1 iter=1 do klo = 1, nocc, kchunk khi = MIN(nocc, klo+kchunk-1) do j = 1, nocc #if defined(DO_UPDATE_ARRAYS) ! Update elements of Tkj and KKj. Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 #endif do i = 1, nocc #if defined(DO_UPDATE_ARRAYS) ! Update elements of Jia, Kia, Tia, Xia arrays. Jia(lnvv) = Jia(lnvv) + 1.0 Kia(lnvv) = Kia(lnvv) + 1.0 Tia(lnov) = Tia(lnov) + 1.0 Xia(lnov) = Xia(lnov) + 1.0 #endif do k = klo, MIN(khi,i) #if defined(DO_UPDATE_ARRAYS) ! Update elements of t1v1 array. t1v1(:) = Tkj(lnvv-nvir+1:lnvv) #endif t0 = omp_get_wtime() call omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) t1 = omp_get_wtime() timers(iter) = (t1-t0) if (iter .eq. maxiter) then print *, "Stopping after ", iter, "iterations" print *, " " goto 150 endif ! Prevent NAN for large maxiter... if (emp4 > 1000.0) then emp4 = emp4 - 1000.0 endif if (emp4 < -1000.0) then emp4 = emp4 + 1000.0 endif if (emp5 > 1000.0) then emp5 = emp5 - 1000.0 endif if (emp5 < -1000.0) then emp5 = emp5 + 1000.0 endif iter = iter + 1 end do ! k = klo, MIN(khi,i) end do ! do i = 1, nocc end do ! do j = 1, nocc end do ! do klo = 1, nocc, kchunk 150 CONTINUE tsum = 0.0 tmax = -1.0e10 tmin = 1.0e10 do i = 2, iter tsum = tsum + timers(i) tmax = MAX(tmax,timers(i)) tmin = MIN(tmin,timers(i)) end do tavg = tsum / (iter - 1) print *, "TOTAL ITER: ", iter write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds" 110 format (A, F9.6, A, F9.6, A, F9.6, A) write(*, 120) " emp4 = ", emp4, " emp5 =", emp5 120 format (A, F15.3, A, F15.3) print *, "END" deallocate (f1n) deallocate (f2n) deallocate (eorb) deallocate (Jia) deallocate (Kia) deallocate (Tia) deallocate (Xia) deallocate (Tkj) deallocate (Kkj) deallocate (t1v1) deallocate (t1v2) deallocate (timers) end program
While this version is straightforward to program, it is not the most efficient.
Version 2
In Version 2 of the program, we allocate all 11 arrays in system memory using plain allocate, and use the map clause to map the arrays to the device:
real, allocatable :: f1n(:,:) real, allocatable :: f2n(:,:) real, allocatable :: eorb(:) ... !$omp target data & map(to: f1n, f2n, eorb) & map(to: Jia, Kia, Tia, Xia) & map(to: Tkj, Kkj) & map(to: t1v1, t1v2)
When an array that is mapped to the device is updated on the host, we use the OpenMP target update to directive to copy the new values of the array from the host to the device. We place the directive at the highest level in the loop nest as possible, to avoid unnecessary copying of the array.
Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 !$omp target update to (Tkj, Kkj)
The full Version 2 is shown below.
include "mkl_omp_offload.f90" subroutine omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) use omp_lib use onemkl_blas_omp_offload_lp64 use iso_fortran_env implicit none real, intent(inout) :: emp4,emp5 integer, intent(in) :: ncor,nocc,nvir integer, intent(in) :: a,i,j,k, klo real, intent(inout) :: f1n(nvir,nvir) real, intent(inout) :: f2n(nvir,nvir) real, intent(in) :: eorb(*) real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*) real, intent(in) :: Tkj(*), Kkj(*) real, intent(in) :: t1v1(nvir),t1v2(nvir) real :: emp4i,emp5i real :: eaijk,denom integer :: lnov,lnvv integer :: b,c real :: f1nbc,f1ncb,f2nbc,f2ncb real :: t1v1b,t1v2b lnov=nocc*nvir lnvv=nvir*nvir emp4i = 0.0 emp5i = 0.0 !$omp dispatch call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, & Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir) !$omp dispatch call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, & Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir) !$omp dispatch call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, & Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir) !$omp dispatch call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, & Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir) eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) ) !$omp target teams distribute parallel do collapse(2) & !$omp reduction(+:emp4i,emp5i) & !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) & !$omp private(t1v1b,t1v2b) & !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc) do b=1,nvir do c=1,nvir denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk ) f1nbc = f1n(b,c); f1ncb = f1n(c,b); f2nbc = f2n(b,c); f2ncb = f2n(c,b); t1v1b = t1v1(b); t1v2b = t1v2(b); emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb) emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb) enddo enddo !$omp end target teams distribute parallel do emp4 = emp4 + emp4i emp5 = emp5 + emp5i end subroutine init_array_1(arr, m) implicit none real, intent(inout) :: arr(m) integer m, i do i = 1, m arr(i) = 1.0/(100.0 + i-1) end do end subroutine init_array_1 subroutine init_array_2(arr, m, n) implicit none real, intent(inout) :: arr(m, n) integer m, n, i, j do i = 1, m do j = 1, n arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j) end do end do end subroutine init_array_2 program main use omp_lib use iso_fortran_env implicit none interface subroutine omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) real, intent(inout) :: emp4,emp5 integer, intent(in) :: ncor,nocc,nvir integer, intent(in) :: a,i,j,k, klo real, intent(inout) :: f1n(nvir,nvir) real, intent(inout) :: f2n(nvir,nvir) real, intent(in) :: eorb(*) real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*) real, intent(in) :: t1v1(nvir),t1v2(nvir) end subroutine omp_fbody end interface integer :: ncor, nocc, nvir, maxiter, nkpass integer :: nbf, lnvv, lnov, kchunk real, allocatable :: eorb(:) real, allocatable :: f1n(:,:) real, allocatable :: f2n(:,:) real, allocatable :: Jia(:) real, allocatable :: Kia(:) real, allocatable :: Tia(:) real, allocatable :: Xia(:) real, allocatable :: Tkj(:) real, allocatable :: Kkj(:) real, allocatable :: t1v1(:),t1v2(:) real emp4, emp5 integer :: a, b, c, i, j, k integer :: klo, khi, iter double precision, allocatable :: timers(:) double precision :: t0, t1, tsum, tmax, tmin, tavg ! Run parameters nocc = 256 nvir = 2048 maxiter = 50 nkpass = 1 ncor = 0 print *, "Run parameters:" print *, "nocc =", nocc print *, "nvir =", nvir print *, "maxiter =", maxiter print *, "nkpass =", nkpass print *, "ncor =", ncor print *, " " ! Allocate and initialize arrays. nbf = ncor + nocc + nvir lnvv = nvir * nvir lnov = nocc * nvir kchunk = (nocc - 1)/nkpass + 1 allocate( f1n(1:nvir,1:nvir) ) allocate( f2n(1:nvir,1:nvir) ) allocate( eorb(1:nbf) ) allocate( Jia(1:lnvv) ) allocate( Kia(1:lnvv) ) allocate( Tia(1:lnov*nocc) ) allocate( Xia(1:lnov*nocc)) allocate( Tkj(1:kchunk*lnvv) ) allocate( Kkj(1:kchunk*lnvv) ) allocate( t1v1(1:lnvv) ) allocate( t1v2(1:lnvv) ) ! call init_array_1(eorb, nbf) call init_array_1(Jia, lnvv) call init_array_1(Kia, lnvv) call init_array_1(Tia, lnov*nocc) call init_array_1(Xia, lnov*nocc) call init_array_1(Tkj, kchunk*lnvv) call init_array_1(Kkj, kchunk*lnov) call init_array_1(t1v1, lnvv) call init_array_1(t1v2, lnvv) call init_array_2(f1n, nvir, nvir) call init_array_2(f2n, nvir, nvir) print *, "End of initialization" allocate (timers(1:maxiter)) emp4=0.0 emp5=0.0 a=1 iter=1 !$omp target data & map(to: f1n, f2n, eorb) & map(to: Jia, Kia, Tia, Xia) & map(to: Tkj, Kkj) & map(to: t1v1, t1v2) do klo = 1, nocc, kchunk khi = MIN(nocc, klo+kchunk-1) do j = 1, nocc #if defined(DO_UPDATE_ARRAYS) ! Update elements of Tkj and KKj. Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 !$omp target update to (Tkj, Kkj) #endif do i = 1, nocc #if defined(DO_UPDATE_ARRAYS) ! Update elements of Jia, Kia, Tia, Xia arrays. Jia(lnvv) = Jia(lnvv) + 1.0 Kia(lnvv) = Kia(lnvv) + 1.0 Tia(lnov) = Tia(lnov) + 1.0 Xia(lnov) = Xia(lnov) + 1.0 !$omp target update to (Jia, Kia, Tia, Xia) #endif do k = klo, MIN(khi,i) #if defined(DO_UPDATE_ARRAYS) ! Update elements of t1v1 array. t1v1(:) = Tkj(lnvv-nvir+1:lnvv) !$omp target update to (t1v1) #endif t0 = omp_get_wtime() call omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) t1 = omp_get_wtime() timers(iter) = (t1-t0) if (iter .eq. maxiter) then print *, "Stopping after ", iter, "iterations" print *, " " goto 150 endif ! Prevent NAN for large maxiter... if (emp4 > 1000.0) then emp4 = emp4 - 1000.0 endif if (emp4 < -1000.0) then emp4 = emp4 + 1000.0 endif if (emp5 > 1000.0) then emp5 = emp5 - 1000.0 endif if (emp5 < -1000.0) then emp5 = emp5 + 1000.0 endif iter = iter + 1 end do ! k = klo, MIN(khi,i) end do ! do i = 1, nocc end do ! do j = 1, nocc end do ! do klo = 1, nocc, kchunk 150 CONTINUE !$omp end target data tsum = 0.0 tmax = -1.0e10 tmin = 1.0e10 do i = 2, iter tsum = tsum + timers(i) tmax = MAX(tmax,timers(i)) tmin = MIN(tmin,timers(i)) end do tavg = tsum / (iter - 1) print *, "TOTAL ITER: ", iter write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds" 110 format (A, F9.6, A, F9.6, A, F9.6, A) write(*, 120) " emp4 = ", emp4, " emp5 =", emp5 120 format (A, F15.3, A, F15.3) print *, "END" deallocate (f1n) deallocate (f2n) deallocate (eorb) deallocate (Jia) deallocate (Kia) deallocate (Tia) deallocate (Xia) deallocate (Tkj) deallocate (Kkj) deallocate (t1v1) deallocate (t1v2) deallocate (timers) end program
Version 3
In the third version of the program, we consider which arrays are used on the host only, on the device only, or on both the host and the device. We also consider whether the array values are updated during the execution of the program. This information is used to decide where to allocate the arrays, how to initialize them, and whether to update their values on the device.
Arrays f1 and f2 are used as work arrays on the device (to store the results of calls to SGEMM), and they are not accessed on the host. So we allocate them directly on the device.
Since f1 and f2 are allocated on the device, we call init_array_2() to initialize the arrays in an OpenMP target construct.
The other 9 arrays are accessed on both the host and the device, so we allocate them in Host memory and and call init_array_1() to initialize the arrays. The arrays are then mapped to the device using the map clause.
We use the OpenMP target update to directive to copy the updated values of Tkj, Kkj, Jia, Kia, Tia, Xia, and t1v1 from the host to the device.
!$omp allocate allocator(omp_target_device_mem_alloc) allocate( f1n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_device_mem_alloc) allocate( f2n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( eorb(1:nbf) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Jia(1:lnvv) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Kia(1:lnvv) ) ... !$omp target data & map(to: eorb) & map(to: Jia, Kia, Tia, Xia) & map(to: Tkj, Kkj) & map(to: t1v1, t1v2) ... Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 !$omp target update to (Tkj, Kkj)
The full Version 3 of the program is shown below.
include "mkl_omp_offload.f90" subroutine omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) use omp_lib use onemkl_blas_omp_offload_lp64 use iso_fortran_env implicit none real, intent(inout) :: emp4,emp5 integer, intent(in) :: ncor,nocc,nvir integer, intent(in) :: a,i,j,k, klo real, intent(inout) :: f1n(nvir,nvir) real, intent(inout) :: f2n(nvir,nvir) real, intent(in) :: eorb(*) real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*) real, intent(in) :: Tkj(*), Kkj(*) real, intent(in) :: t1v1(nvir),t1v2(nvir) real :: emp4i,emp5i real :: eaijk,denom integer :: lnov,lnvv integer :: b,c real :: f1nbc,f1ncb,f2nbc,f2ncb real :: t1v1b,t1v2b lnov=nocc*nvir lnvv=nvir*nvir emp4i = 0.0 emp5i = 0.0 !$omp dispatch call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, & Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir) !$omp dispatch call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, & Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir) !$omp dispatch call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, & Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir) !$omp dispatch call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, & Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir) eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) ) !$omp target teams distribute parallel do collapse(2) & !$omp reduction(+:emp4i,emp5i) & !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) & !$omp private(t1v1b,t1v2b) & !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc) do b=1,nvir do c=1,nvir denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk ) f1nbc = f1n(b,c); f1ncb = f1n(c,b); f2nbc = f2n(b,c); f2ncb = f2n(c,b); t1v1b = t1v1(b); t1v2b = t1v2(b); emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb) emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb) enddo enddo !$omp end target teams distribute parallel do emp4 = emp4 + emp4i emp5 = emp5 + emp5i end subroutine init_array_1(arr, m) implicit none real, intent(inout) :: arr(m) integer m, i do i = 1, m arr(i) = 1.0/(100.0 + i-1) end do end subroutine init_array_1 subroutine init_array_2(arr, m, n) implicit none real, intent(inout) :: arr(m, n) integer m, n, i, j !$omp target teams distribute parallel do do i = 1, m do j = 1, n arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j) end do end do end subroutine init_array_2 program main use omp_lib use iso_fortran_env implicit none interface subroutine omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) real, intent(inout) :: emp4,emp5 integer, intent(in) :: ncor,nocc,nvir integer, intent(in) :: a,i,j,k, klo real, intent(inout) :: f1n(nvir,nvir) real, intent(inout) :: f2n(nvir,nvir) real, intent(in) :: eorb(*) real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*) real, intent(in) :: t1v1(nvir),t1v2(nvir) end subroutine omp_fbody end interface integer :: ncor, nocc, nvir, maxiter, nkpass integer :: nbf, lnvv, lnov, kchunk real, allocatable :: eorb(:) real, allocatable :: f1n(:,:) real, allocatable :: f2n(:,:) real, allocatable :: Jia(:) real, allocatable :: Kia(:) real, allocatable :: Tia(:) real, allocatable :: Xia(:) real, allocatable :: Tkj(:) real, allocatable :: Kkj(:) real, allocatable :: t1v1(:),t1v2(:) real emp4, emp5 integer :: a, b, c, i, j, k integer :: klo, khi, iter double precision, allocatable :: timers(:) double precision :: t0, t1, tsum, tmax, tmin, tavg ! Run parameters nocc = 256 nvir = 2048 maxiter = 50 nkpass = 1 ncor = 0 print *, "Run parameters:" print *, "nocc =", nocc print *, "nvir =", nvir print *, "maxiter =", maxiter print *, "nkpass =", nkpass print *, "ncor =", ncor print *, " " ! Allocate and initialize arrays. nbf = ncor + nocc + nvir lnvv = nvir * nvir lnov = nocc * nvir kchunk = (nocc - 1)/nkpass + 1 !$omp allocate allocator(omp_target_device_mem_alloc) allocate( f1n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_device_mem_alloc) allocate( f2n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( eorb(1:nbf) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Jia(1:lnvv) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Kia(1:lnvv) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Tia(1:lnov*nocc) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Xia(1:lnov*nocc)) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Tkj(1:kchunk*lnvv) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Kkj(1:kchunk*lnvv) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( t1v1(1:lnvv) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( t1v2(1:lnvv) ) ! call init_array_1(eorb, nbf) call init_array_1(Jia, lnvv) call init_array_1(Kia, lnvv) call init_array_1(Tia, lnov*nocc) call init_array_1(Xia, lnov*nocc) call init_array_1(Tkj, kchunk*lnvv) call init_array_1(Kkj, kchunk*lnov) call init_array_1(t1v1, lnvv) call init_array_1(t1v2, lnvv) call init_array_2(f1n, nvir, nvir) call init_array_2(f2n, nvir, nvir) print *, "End of initialization" allocate (timers(1:maxiter)) emp4=0.0 emp5=0.0 a=1 iter=1 !$omp target data & map(to: eorb) & map(to: Jia, Kia, Tia, Xia) & map(to: Tkj, Kkj) & map(to: t1v1, t1v2) do klo = 1, nocc, kchunk khi = MIN(nocc, klo+kchunk-1) do j = 1, nocc #if defined(DO_UPDATE_ARRAYS) ! Update elements of Tkj and KKj. Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 !$omp target update to (Tkj, Kkj) #endif do i = 1, nocc #if defined(DO_UPDATE_ARRAYS) ! Update elements of Jia, Kia, Tia, Xia arrays. Jia(lnvv) = Jia(lnvv) + 1.0 Kia(lnvv) = Kia(lnvv) + 1.0 Tia(lnov) = Tia(lnov) + 1.0 Xia(lnov) = Xia(lnov) + 1.0 !$omp target update to (Jia, Kia, Tia, Xia) #endif do k = klo, MIN(khi,i) #if defined(DO_UPDATE_ARRAYS) ! Update elements of t1v1 array. t1v1(:) = Tkj(lnvv-nvir+1:lnvv) !$omp target update to (t1v1) #endif t0 = omp_get_wtime() call omp_fbody(f1n,f2n,eorb, & ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & Jia, Kia, Tia, Xia, Tkj, Kkj, & t1v1,t1v2) t1 = omp_get_wtime() timers(iter) = (t1-t0) if (iter .eq. maxiter) then print *, "Stopping after ", iter, "iterations" print *, " " goto 150 endif ! Prevent NAN for large maxiter... if (emp4 > 1000.0) then emp4 = emp4 - 1000.0 endif if (emp4 < -1000.0) then emp4 = emp4 + 1000.0 endif if (emp5 > 1000.0) then emp5 = emp5 - 1000.0 endif if (emp5 < -1000.0) then emp5 = emp5 + 1000.0 endif iter = iter + 1 end do ! k = klo, MIN(khi,i) end do ! do i = 1, nocc end do ! do j = 1, nocc end do ! do klo = 1, nocc, kchunk 150 CONTINUE !$omp end target data tsum = 0.0 tmax = -1.0e10 tmin = 1.0e10 do i = 2, iter tsum = tsum + timers(i) tmax = MAX(tmax,timers(i)) tmin = MIN(tmin,timers(i)) end do tavg = tsum / (iter - 1) print *, "TOTAL ITER: ", iter write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds" 110 format (A, F9.6, A, F9.6, A, F9.6, A) write(*, 120) " emp4 = ", emp4, " emp5 =", emp5 120 format (A, F15.3, A, F15.3) print *, "END" deallocate (f1n) deallocate (f2n) deallocate (eorb) deallocate (Jia) deallocate (Kia) deallocate (Tia) deallocate (Xia) deallocate (Tkj) deallocate (Kkj) deallocate (t1v1) deallocate (t1v2) deallocate (timers) end program
Performance Comparison
The following commands were used to compile, link, and run the various versions. In the following, substitute the source filename for TEST.f.
Compile: ifx -O3 -fiopenmp -fopenmp-targets=spir64 -DMKL -qmkl -fpp -free -D DO_UPDATE_ARRAYS TEST.f -c -o TEST.o Link: ifx -fiopenmp -fopenmp-targets=spir64 -qmkl -fsycl -L${MKLROOT}/lib/intel64 -liomp5 -lsycl -lOpenCL -lstdc++ -lpthread -lm -ldl -lmkl_sycl -D DO_UPDATE_ARRAYS TEST.o -o TEST.exe Run: LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH=copy OMP_TARGET_OFFLOAD=MANDATORY ZE_AFFINITY_MASK=0.0 LIBOMPTARGET_DEBUG=0 LIBOMPTARGET_PLUGIN=level0 TEST.exe
We compared the performance of the three versions on the particular GPU used (1-stack only).
The average iteration time for the various versions was as follows.
Version 1 (test-SharedMem.f: 0.115163 seconds Version 2 (test-Map-UpdateTo.f): 0.002012 seconds Version 3 (test-DeviceMem-Map-UpdateTo.f): 0.001978 seconds