Fortran Example
We will use a Fortran example to illustrate how choosing memory allocations appropriately can avoid redundant data transfers and boost performance.
The Fortran example uses OpenMP offload and is adapted from NWChem, a high-performance computational chemistry application.
In the example, there is a 4-level loop nest. The innermost k-loop calls the omp_fbody routine, which offloads four sgemm calls to the device and then offloads a reduction computation (for the reduction variables emp4i and emp5i) to the device.
In the loop nest, some arrays are updated on the host (namely, arrays Tkj, Kkj, Jia, Kia, Tia, Xia, and t1v1). So we need to make the values of these arrays on the device consistent with their values on the host.
Version 1
In the first (naive) version of the program, we allocate all 11 arrays in Shared memory using the OpenMP allocators directive with the omp_target_shared_mem_alloc predefined allocator.
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): f1n)
allocate( f1n(1:nvir,1:nvir) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): f2n)
allocate( f2n(1:nvir,1:nvir) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): eorb)
allocate( eorb(1:nbf) )
...
Shared allocations are accessible by both the host and the device, and they migrate automatically between the host and the device as needed. So the same pointer to a Shared allocation may be used in host code and in device code.
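As a minimal, self-contained sketch (not taken from the NWChem example), a Shared allocation can be written on the host, updated in a target region with no map clause, and read again on the host:
program shared_demo
  use omp_lib
  implicit none
  integer :: i, n
  real, allocatable :: v(:)
  n = 1024
  ! Allocate v in Shared (unified) memory.
  !$omp allocators allocate(allocator(omp_target_shared_mem_alloc): v)
  allocate( v(1:n) )
  v = 1.0                          ! initialized on the host
  !$omp target teams distribute parallel do
  do i = 1, n
     v(i) = v(i) + 1.0             ! same allocation accessed on the device
  end do
  !$omp end target teams distribute parallel do
  print *, v(1)                    ! host reads the device's update: 2.0
  deallocate( v )
end program shared_demo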
Version 1 is shown below.
include "mkl_omp_offload.f90"
subroutine omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
use omp_lib
use onemkl_blas_omp_offload_lp64
use iso_fortran_env
implicit none
real, intent(inout) :: emp4,emp5
integer, intent(in) :: ncor,nocc,nvir
integer, intent(in) :: a,i,j,k, klo
real, intent(inout) :: f1n(nvir,nvir)
real, intent(inout) :: f2n(nvir,nvir)
real, intent(in) :: eorb(*)
real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*)
real, intent(in) :: Tkj(*), Kkj(*)
real, intent(in) :: t1v1(nvir),t1v2(nvir)
real :: emp4i,emp5i
real :: eaijk,denom
integer :: lnov,lnvv
integer :: b,c
real :: f1nbc,f1ncb,f2nbc,f2ncb
real :: t1v1b,t1v2b
lnov=nocc*nvir
lnvv=nvir*nvir
emp4i = 0.0
emp5i = 0.0
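! Offload the four oneMKL sgemm calls below to the device; each
! dispatch construct invokes the offload variant declared in the
! onemkl_blas_omp_offload_lp64 module.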
!$omp dispatch
call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, &
Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
!$omp dispatch
call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, &
Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
!$omp dispatch
call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, &
Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
!$omp dispatch
call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, &
Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
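! Offload the doubly nested reduction loop; the partial sums emp4i
! and emp5i are combined across all device threads.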
!$omp target teams distribute parallel do collapse(2) &
!$omp reduction(+:emp4i,emp5i) &
!$omp private(f1nbc,f1ncb,f2nbc,f2ncb) &
!$omp private(t1v1b,t1v2b) &
!$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc)
do b=1,nvir
do c=1,nvir
denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
f1nbc = f1n(b,c);
f1ncb = f1n(c,b);
f2nbc = f2n(b,c);
f2ncb = f2n(c,b);
t1v1b = t1v1(b);
t1v2b = t1v2(b);
emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
enddo
enddo
!$omp end target teams distribute parallel do
emp4 = emp4 + emp4i
emp5 = emp5 + emp5i
end
subroutine init_array_1(arr, m)
implicit none
real, intent(inout) :: arr(m)
integer m, i
do i = 1, m
arr(i) = 1.0/(100.0 + i-1)
end do
end subroutine init_array_1
subroutine init_array_2(arr, m, n)
implicit none
real, intent(inout) :: arr(m, n)
integer m, n, i, j
do i = 1, m
do j = 1, n
arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
end do
end do
end subroutine init_array_2
program main
use omp_lib
use iso_fortran_env
implicit none
interface
subroutine omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
real, intent(inout) :: emp4,emp5
integer, intent(in) :: ncor,nocc,nvir
integer, intent(in) :: a,i,j,k, klo
real, intent(inout) :: f1n(nvir,nvir)
real, intent(inout) :: f2n(nvir,nvir)
real, intent(in) :: eorb(*)
real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
real, intent(in) :: t1v1(nvir),t1v2(nvir)
end subroutine omp_fbody
end interface
integer :: ncor, nocc, nvir, maxiter, nkpass
integer :: nbf, lnvv, lnov, kchunk
real, allocatable :: eorb(:)
real, allocatable :: f1n(:,:)
real, allocatable :: f2n(:,:)
real, allocatable :: Jia(:)
real, allocatable :: Kia(:)
real, allocatable :: Tia(:)
real, allocatable :: Xia(:)
real, allocatable :: Tkj(:)
real, allocatable :: Kkj(:)
real, allocatable :: t1v1(:),t1v2(:)
real emp4, emp5
integer :: a, b, c, i, j, k
integer :: klo, khi, iter
double precision, allocatable :: timers(:)
double precision :: t0, t1, tsum, tmax, tmin, tavg
! Run parameters
nocc = 256
nvir = 2048
maxiter = 50
nkpass = 1
ncor = 0
print *, "Run parameters:"
print *, "nocc =", nocc
print *, "nvir =", nvir
print *, "maxiter =", maxiter
print *, "nkpass =", nkpass
print *, "ncor =", ncor
print *, " "
! Allocate and initialize arrays.
nbf = ncor + nocc + nvir
lnvv = nvir * nvir
lnov = nocc * nvir
kchunk = (nocc - 1)/nkpass + 1
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): f1n)
allocate( f1n(1:nvir,1:nvir) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): f2n)
allocate( f2n(1:nvir,1:nvir) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): eorb)
allocate( eorb(1:nbf) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): Jia)
allocate( Jia(1:lnvv) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): Kia)
allocate( Kia(1:lnvv) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): Tia)
allocate( Tia(1:lnov*nocc) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): Xia)
allocate( Xia(1:lnov*nocc))
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): Tkj)
allocate( Tkj(1:kchunk*lnvv) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): Kkj)
allocate( Kkj(1:kchunk*lnvv) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): t1v1)
allocate( t1v1(1:lnvv) )
!$omp allocators allocate(allocator(omp_target_shared_mem_alloc): t1v2)
allocate( t1v2(1:lnvv) )
!
call init_array_1(eorb, nbf)
call init_array_1(Jia, lnvv)
call init_array_1(Kia, lnvv)
call init_array_1(Tia, lnov*nocc)
call init_array_1(Xia, lnov*nocc)
call init_array_1(Tkj, kchunk*lnvv)
call init_array_1(Kkj, kchunk*lnov)
call init_array_1(t1v1, lnvv)
call init_array_1(t1v2, lnvv)
call init_array_2(f1n, nvir, nvir)
call init_array_2(f2n, nvir, nvir)
print *, "End of initialization"
allocate (timers(1:maxiter))
emp4=0.0
emp5=0.0
a=1
iter=1
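! 4-level loop nest: klo (chunks of k), j, i, k. omp_fbody is called
! in the innermost k-loop.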
do klo = 1, nocc, kchunk
khi = MIN(nocc, klo+kchunk-1)
do j = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
! Update elements of Tkj and Kkj.
Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
#endif
do i = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
! Update elements of Jia, Kia, Tia, Xia arrays.
Jia(lnvv) = Jia(lnvv) + 1.0
Kia(lnvv) = Kia(lnvv) + 1.0
Tia(lnov) = Tia(lnov) + 1.0
Xia(lnov) = Xia(lnov) + 1.0
#endif
do k = klo, MIN(khi,i)
#if defined(DO_UPDATE_ARRAYS)
! Update elements of t1v1 array.
t1v1(1:nvir) = Tkj(lnvv-nvir+1:lnvv)
#endif
t0 = omp_get_wtime()
call omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
t1 = omp_get_wtime()
timers(iter) = (t1-t0)
if (iter .eq. maxiter) then
print *, "Stopping after ", iter, "iterations"
print *, " "
goto 150
endif
! Prevent NAN for large maxiter...
if (emp4 > 1000.0) then
emp4 = emp4 - 1000.0
endif
if (emp4 < -1000.0) then
emp4 = emp4 + 1000.0
endif
if (emp5 > 1000.0) then
emp5 = emp5 - 1000.0
endif
if (emp5 < -1000.0) then
emp5 = emp5 + 1000.0
endif
iter = iter + 1
end do ! k = klo, MIN(khi,i)
end do ! do i = 1, nocc
end do ! do j = 1, nocc
end do ! do klo = 1, nocc, kchunk
150 CONTINUE
tsum = 0.0
tmax = -1.0e10
tmin = 1.0e10
do i = 2, iter
tsum = tsum + timers(i)
tmax = MAX(tmax,timers(i))
tmin = MIN(tmin,timers(i))
end do
tavg = tsum / (iter - 1)
print *, "TOTAL ITER: ", iter
write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
110 format (A, F9.6, A, F9.6, A, F9.6, A)
write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
120 format (A, F15.3, A, F15.3)
print *, "END"
deallocate (f1n)
deallocate (f2n)
deallocate (eorb)
deallocate (Jia)
deallocate (Kia)
deallocate (Tia)
deallocate (Xia)
deallocate (Tkj)
deallocate (Kkj)
deallocate (t1v1)
deallocate (t1v2)
deallocate (timers)
end program
While this version is straightforward to program, it is not the most efficient, since accesses from the host and the device cause the Shared allocations to migrate back and forth between the two as the loop nest executes.
Version 2
In Version 2 of the program, we allocate all 11 arrays in system memory using the plain Fortran allocate statement, and we use map clauses on a target data construct to create device copies of the arrays and copy their values to the device:
real, allocatable :: f1n(:,:)
real, allocatable :: f2n(:,:)
real, allocatable :: eorb(:)
...
!$omp target data &
!$omp map(to: f1n, f2n, eorb) &
!$omp map(to: Jia, Kia, Tia, Xia) &
!$omp map(to: Tkj, Kkj) &
!$omp map(to: t1v1, t1v2)
When an array that is mapped to the device is updated on the host, we use the OpenMP target update to directive to copy the new values of the array from the host to the device. We place each directive as high in the loop nest as possible, to avoid copying an array more often than necessary.
Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
!$omp target update to (Tkj, Kkj)
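The placement principle can be seen in a minimal, self-contained sketch (hypothetical arrays a and b, not from the NWChem example): an array that changes once per outer iteration is updated at the outer level, while one that changes on every inner iteration must be updated inside the inner loop.
program update_placement
  implicit none
  integer :: i, j
  real :: a(100), b(100)
  a = 0.0
  b = 0.0
  !$omp target data map(to: a, b)
  do j = 1, 10
     a(j) = real(j)                 ! a changes once per j iteration...
     !$omp target update to (a)     ! ...so one update per j suffices
     do i = 1, 10
        b(i) = real(i)              ! b changes on every i iteration...
        !$omp target update to (b)  ! ...so it must be updated here
        !$omp target
        b(i) = b(i) + a(j)          ! the device consumes the fresh values
        !$omp end target
     end do
  end do
  !$omp target update from (b)      ! copy final device values back
  !$omp end target data
  print *, b(10)                    ! expect 20.0 (10 + a(10))
end program update_placement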
The full Version 2 is shown below.
include "mkl_omp_offload.f90"
subroutine omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
use omp_lib
use onemkl_blas_omp_offload_lp64
use iso_fortran_env
implicit none
real, intent(inout) :: emp4,emp5
integer, intent(in) :: ncor,nocc,nvir
integer, intent(in) :: a,i,j,k, klo
real, intent(inout) :: f1n(nvir,nvir)
real, intent(inout) :: f2n(nvir,nvir)
real, intent(in) :: eorb(*)
real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*)
real, intent(in) :: Tkj(*), Kkj(*)
real, intent(in) :: t1v1(nvir),t1v2(nvir)
real :: emp4i,emp5i
real :: eaijk,denom
integer :: lnov,lnvv
integer :: b,c
real :: f1nbc,f1ncb,f2nbc,f2ncb
real :: t1v1b,t1v2b
lnov=nocc*nvir
lnvv=nvir*nvir
emp4i = 0.0
emp5i = 0.0
!$omp dispatch
call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, &
Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
!$omp dispatch
call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, &
Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
!$omp dispatch
call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, &
Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
!$omp dispatch
call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, &
Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
!$omp target teams distribute parallel do collapse(2) &
!$omp reduction(+:emp4i,emp5i) &
!$omp private(f1nbc,f1ncb,f2nbc,f2ncb) &
!$omp private(t1v1b,t1v2b) &
!$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc)
do b=1,nvir
do c=1,nvir
denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
f1nbc = f1n(b,c);
f1ncb = f1n(c,b);
f2nbc = f2n(b,c);
f2ncb = f2n(c,b);
t1v1b = t1v1(b);
t1v2b = t1v2(b);
emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
enddo
enddo
!$omp end target teams distribute parallel do
emp4 = emp4 + emp4i
emp5 = emp5 + emp5i
end
subroutine init_array_1(arr, m)
implicit none
real, intent(inout) :: arr(m)
integer m, i
do i = 1, m
arr(i) = 1.0/(100.0 + i-1)
end do
end subroutine init_array_1
subroutine init_array_2(arr, m, n)
implicit none
real, intent(inout) :: arr(m, n)
integer m, n, i, j
do i = 1, m
do j = 1, n
arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
end do
end do
end subroutine init_array_2
program main
use omp_lib
use iso_fortran_env
implicit none
interface
subroutine omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
real, intent(inout) :: emp4,emp5
integer, intent(in) :: ncor,nocc,nvir
integer, intent(in) :: a,i,j,k, klo
real, intent(inout) :: f1n(nvir,nvir)
real, intent(inout) :: f2n(nvir,nvir)
real, intent(in) :: eorb(*)
real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
real, intent(in) :: t1v1(nvir),t1v2(nvir)
end subroutine omp_fbody
end interface
integer :: ncor, nocc, nvir, maxiter, nkpass
integer :: nbf, lnvv, lnov, kchunk
real, allocatable :: eorb(:)
real, allocatable :: f1n(:,:)
real, allocatable :: f2n(:,:)
real, allocatable :: Jia(:)
real, allocatable :: Kia(:)
real, allocatable :: Tia(:)
real, allocatable :: Xia(:)
real, allocatable :: Tkj(:)
real, allocatable :: Kkj(:)
real, allocatable :: t1v1(:),t1v2(:)
real emp4, emp5
integer :: a, b, c, i, j, k
integer :: klo, khi, iter
double precision, allocatable :: timers(:)
double precision :: t0, t1, tsum, tmax, tmin, tavg
! Run parameters
nocc = 256
nvir = 2048
maxiter = 50
nkpass = 1
ncor = 0
print *, "Run parameters:"
print *, "nocc =", nocc
print *, "nvir =", nvir
print *, "maxiter =", maxiter
print *, "nkpass =", nkpass
print *, "ncor =", ncor
print *, " "
! Allocate and initialize arrays.
nbf = ncor + nocc + nvir
lnvv = nvir * nvir
lnov = nocc * nvir
kchunk = (nocc - 1)/nkpass + 1
allocate( f1n(1:nvir,1:nvir) )
allocate( f2n(1:nvir,1:nvir) )
allocate( eorb(1:nbf) )
allocate( Jia(1:lnvv) )
allocate( Kia(1:lnvv) )
allocate( Tia(1:lnov*nocc) )
allocate( Xia(1:lnov*nocc))
allocate( Tkj(1:kchunk*lnvv) )
allocate( Kkj(1:kchunk*lnvv) )
allocate( t1v1(1:lnvv) )
allocate( t1v2(1:lnvv) )
!
call init_array_1(eorb, nbf)
call init_array_1(Jia, lnvv)
call init_array_1(Kia, lnvv)
call init_array_1(Tia, lnov*nocc)
call init_array_1(Xia, lnov*nocc)
call init_array_1(Tkj, kchunk*lnvv)
call init_array_1(Kkj, kchunk*lnov)
call init_array_1(t1v1, lnvv)
call init_array_1(t1v2, lnvv)
call init_array_2(f1n, nvir, nvir)
call init_array_2(f2n, nvir, nvir)
print *, "End of initialization"
allocate (timers(1:maxiter))
emp4=0.0
emp5=0.0
a=1
iter=1
!$omp target data &
!$omp map(to: f1n, f2n, eorb) &
!$omp map(to: Jia, Kia, Tia, Xia) &
!$omp map(to: Tkj, Kkj) &
!$omp map(to: t1v1, t1v2)
do klo = 1, nocc, kchunk
khi = MIN(nocc, klo+kchunk-1)
do j = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
! Update elements of Tkj and Kkj.
Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
!$omp target update to (Tkj, Kkj)
#endif
do i = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
! Update elements of Jia, Kia, Tia, Xia arrays.
Jia(lnvv) = Jia(lnvv) + 1.0
Kia(lnvv) = Kia(lnvv) + 1.0
Tia(lnov) = Tia(lnov) + 1.0
Xia(lnov) = Xia(lnov) + 1.0
!$omp target update to (Jia, Kia, Tia, Xia)
#endif
do k = klo, MIN(khi,i)
#if defined(DO_UPDATE_ARRAYS)
! Update elements of t1v1 array.
t1v1(1:nvir) = Tkj(lnvv-nvir+1:lnvv)
!$omp target update to (t1v1)
#endif
t0 = omp_get_wtime()
call omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
t1 = omp_get_wtime()
timers(iter) = (t1-t0)
if (iter .eq. maxiter) then
print *, "Stopping after ", iter, "iterations"
print *, " "
goto 150
endif
! Prevent NAN for large maxiter...
if (emp4 > 1000.0) then
emp4 = emp4 - 1000.0
endif
if (emp4 < -1000.0) then
emp4 = emp4 + 1000.0
endif
if (emp5 > 1000.0) then
emp5 = emp5 - 1000.0
endif
if (emp5 < -1000.0) then
emp5 = emp5 + 1000.0
endif
iter = iter + 1
end do ! k = klo, MIN(khi,i)
end do ! do i = 1, nocc
end do ! do j = 1, nocc
end do ! do klo = 1, nocc, kchunk
150 CONTINUE
!$omp end target data
tsum = 0.0
tmax = -1.0e10
tmin = 1.0e10
do i = 2, iter
tsum = tsum + timers(i)
tmax = MAX(tmax,timers(i))
tmin = MIN(tmin,timers(i))
end do
tavg = tsum / (iter - 1)
print *, "TOTAL ITER: ", iter
write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
110 format (A, F9.6, A, F9.6, A, F9.6, A)
write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
120 format (A, F15.3, A, F15.3)
print *, "END"
deallocate (f1n)
deallocate (f2n)
deallocate (eorb)
deallocate (Jia)
deallocate (Kia)
deallocate (Tia)
deallocate (Xia)
deallocate (Tkj)
deallocate (Kkj)
deallocate (t1v1)
deallocate (t1v2)
deallocate (timers)
end program
Version 3
In the third version of the program, we consider which arrays are used on the host only, on the device only, or on both the host and the device. We also consider whether the array values are updated during the execution of the program. This information is used to decide where to allocate the arrays, how to initialize them, and whether to update their values on the device.
Arrays f1n and f2n are used as work arrays on the device (to store the results of the sgemm calls), and they are not accessed on the host. So we allocate them directly in Device memory using the omp_target_device_mem_alloc allocator.
Since f1n and f2n are allocated on the device, init_array_2() initializes them inside an OpenMP target construct.
The other 9 arrays are accessed on both the host and the device, so we allocate them in Host memory and call init_array_1() to initialize them on the host. The arrays are then mapped to the device using the map clause.
We use the OpenMP target update to directive to copy the updated values of Tkj, Kkj, Jia, Kia, Tia, Xia, and t1v1 from the host to the device.
!$omp allocators allocate(allocator(omp_target_device_mem_alloc): f1n)
allocate( f1n(1:nvir,1:nvir) )
!$omp allocators allocate(allocator(omp_target_device_mem_alloc): f2n)
allocate( f2n(1:nvir,1:nvir) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): eorb)
allocate( eorb(1:nbf) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): Jia)
allocate( Jia(1:lnvv) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): Kia)
allocate( Kia(1:lnvv) )
...
!$omp target data &
!$omp map(to: eorb) &
!$omp map(to: Jia, Kia, Tia, Xia) &
!$omp map(to: Tkj, Kkj) &
!$omp map(to: t1v1, t1v2)
...
Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
!$omp target update to (Tkj, Kkj)
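As a minimal, self-contained sketch (again not taken from the example, but using the same predefined allocator as above), a device-allocated array is initialized and consumed entirely on the device, and only a mapped scalar result returns to the host:
program device_demo
  use omp_lib
  implicit none
  integer :: i
  real :: s
  real, allocatable :: w(:)
  ! Allocate w directly in Device memory; the host cannot dereference it.
  !$omp allocators allocate(allocator(omp_target_device_mem_alloc): w)
  allocate( w(1:1000) )
  ! Initialize w on the device, as Version 3 does for f1n and f2n.
  !$omp target teams distribute parallel do
  do i = 1, 1000
     w(i) = real(i)
  end do
  !$omp end target teams distribute parallel do
  s = 0.0
  ! Consume w on the device; the reduction scalar s is implicitly
  ! mapped tofrom, so its final value is visible on the host.
  !$omp target teams distribute parallel do reduction(+:s)
  do i = 1, 1000
     s = s + w(i)
  end do
  !$omp end target teams distribute parallel do
  print *, s                       ! 500500.0
  deallocate( w )
end program device_demo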
The full Version 3 of the program is shown below.
include "mkl_omp_offload.f90"
subroutine omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
use omp_lib
use onemkl_blas_omp_offload_lp64
use iso_fortran_env
implicit none
real, intent(inout) :: emp4,emp5
integer, intent(in) :: ncor,nocc,nvir
integer, intent(in) :: a,i,j,k, klo
real, intent(inout) :: f1n(nvir,nvir)
real, intent(inout) :: f2n(nvir,nvir)
real, intent(in) :: eorb(*)
real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*)
real, intent(in) :: Tkj(*), Kkj(*)
real, intent(in) :: t1v1(nvir),t1v2(nvir)
real :: emp4i,emp5i
real :: eaijk,denom
integer :: lnov,lnvv
integer :: b,c
real :: f1nbc,f1ncb,f2nbc,f2ncb
real :: t1v1b,t1v2b
lnov=nocc*nvir
lnvv=nvir*nvir
emp4i = 0.0
emp5i = 0.0
!$omp dispatch
call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, &
Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
!$omp dispatch
call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, &
Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
!$omp dispatch
call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, &
Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
!$omp dispatch
call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, &
Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
!$omp target teams distribute parallel do collapse(2) &
!$omp reduction(+:emp4i,emp5i) &
!$omp private(f1nbc,f1ncb,f2nbc,f2ncb) &
!$omp private(t1v1b,t1v2b) &
!$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc)
do b=1,nvir
do c=1,nvir
denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
f1nbc = f1n(b,c);
f1ncb = f1n(c,b);
f2nbc = f2n(b,c);
f2ncb = f2n(c,b);
t1v1b = t1v1(b);
t1v2b = t1v2(b);
emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
enddo
enddo
!$omp end target teams distribute parallel do
emp4 = emp4 + emp4i
emp5 = emp5 + emp5i
end
subroutine init_array_1(arr, m)
implicit none
real, intent(inout) :: arr(m)
integer m, i
do i = 1, m
arr(i) = 1.0/(100.0 + i-1)
end do
end subroutine init_array_1
subroutine init_array_2(arr, m, n)
implicit none
real, intent(inout) :: arr(m, n)
integer m, n, i, j
!$omp target teams distribute parallel do
do i = 1, m
do j = 1, n
arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
end do
end do
end subroutine init_array_2
program main
use omp_lib
use iso_fortran_env
implicit none
interface
subroutine omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
real, intent(inout) :: emp4,emp5
integer, intent(in) :: ncor,nocc,nvir
integer, intent(in) :: a,i,j,k, klo
real, intent(inout) :: f1n(nvir,nvir)
real, intent(inout) :: f2n(nvir,nvir)
real, intent(in) :: eorb(*)
real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
real, intent(in) :: t1v1(nvir),t1v2(nvir)
end subroutine omp_fbody
end interface
integer :: ncor, nocc, nvir, maxiter, nkpass
integer :: nbf, lnvv, lnov, kchunk
real, allocatable :: eorb(:)
real, allocatable :: f1n(:,:)
real, allocatable :: f2n(:,:)
real, allocatable :: Jia(:)
real, allocatable :: Kia(:)
real, allocatable :: Tia(:)
real, allocatable :: Xia(:)
real, allocatable :: Tkj(:)
real, allocatable :: Kkj(:)
real, allocatable :: t1v1(:),t1v2(:)
real emp4, emp5
integer :: a, b, c, i, j, k
integer :: klo, khi, iter
double precision, allocatable :: timers(:)
double precision :: t0, t1, tsum, tmax, tmin, tavg
! Run parameters
nocc = 256
nvir = 2048
maxiter = 50
nkpass = 1
ncor = 0
print *, "Run parameters:"
print *, "nocc =", nocc
print *, "nvir =", nvir
print *, "maxiter =", maxiter
print *, "nkpass =", nkpass
print *, "ncor =", ncor
print *, " "
! Allocate and initialize arrays.
nbf = ncor + nocc + nvir
lnvv = nvir * nvir
lnov = nocc * nvir
kchunk = (nocc - 1)/nkpass + 1
!$omp allocators allocate(allocator(omp_target_device_mem_alloc): f1n)
allocate( f1n(1:nvir,1:nvir) )
!$omp allocators allocate(allocator(omp_target_device_mem_alloc): f2n)
allocate( f2n(1:nvir,1:nvir) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): eorb)
allocate( eorb(1:nbf) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): Jia)
allocate( Jia(1:lnvv) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): Kia)
allocate( Kia(1:lnvv) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): Tia)
allocate( Tia(1:lnov*nocc) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): Xia)
allocate( Xia(1:lnov*nocc))
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): Tkj)
allocate( Tkj(1:kchunk*lnvv) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): Kkj)
allocate( Kkj(1:kchunk*lnvv) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): t1v1)
allocate( t1v1(1:lnvv) )
!$omp allocators allocate(allocator(omp_target_host_mem_alloc): t1v2)
allocate( t1v2(1:lnvv) )
!
call init_array_1(eorb, nbf)
call init_array_1(Jia, lnvv)
call init_array_1(Kia, lnvv)
call init_array_1(Tia, lnov*nocc)
call init_array_1(Xia, lnov*nocc)
call init_array_1(Tkj, kchunk*lnvv)
call init_array_1(Kkj, kchunk*lnov)
call init_array_1(t1v1, lnvv)
call init_array_1(t1v2, lnvv)
call init_array_2(f1n, nvir, nvir)
call init_array_2(f2n, nvir, nvir)
print *, "End of initialization"
allocate (timers(1:maxiter))
emp4=0.0
emp5=0.0
a=1
iter=1
!$omp target data &
!$omp map(to: eorb) &
!$omp map(to: Jia, Kia, Tia, Xia) &
!$omp map(to: Tkj, Kkj) &
!$omp map(to: t1v1, t1v2)
do klo = 1, nocc, kchunk
khi = MIN(nocc, klo+kchunk-1)
do j = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
! Update elements of Tkj and Kkj.
Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
!$omp target update to (Tkj, Kkj)
#endif
do i = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
! Update elements of Jia, Kia, Tia, Xia arrays.
Jia(lnvv) = Jia(lnvv) + 1.0
Kia(lnvv) = Kia(lnvv) + 1.0
Tia(lnov) = Tia(lnov) + 1.0
Xia(lnov) = Xia(lnov) + 1.0
!$omp target update to (Jia, Kia, Tia, Xia)
#endif
do k = klo, MIN(khi,i)
#if defined(DO_UPDATE_ARRAYS)
! Update elements of t1v1 array.
t1v1(1:nvir) = Tkj(lnvv-nvir+1:lnvv)
!$omp target update to (t1v1)
#endif
t0 = omp_get_wtime()
call omp_fbody(f1n,f2n,eorb, &
ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
Jia, Kia, Tia, Xia, Tkj, Kkj, &
t1v1,t1v2)
t1 = omp_get_wtime()
timers(iter) = (t1-t0)
if (iter .eq. maxiter) then
print *, "Stopping after ", iter, "iterations"
print *, " "
goto 150
endif
! Prevent NAN for large maxiter...
if (emp4 > 1000.0) then
emp4 = emp4 - 1000.0
endif
if (emp4 < -1000.0) then
emp4 = emp4 + 1000.0
endif
if (emp5 > 1000.0) then
emp5 = emp5 - 1000.0
endif
if (emp5 < -1000.0) then
emp5 = emp5 + 1000.0
endif
iter = iter + 1
end do ! k = klo, MIN(khi,i)
end do ! do i = 1, nocc
end do ! do j = 1, nocc
end do ! do klo = 1, nocc, kchunk
150 CONTINUE
!$omp end target data
tsum = 0.0
tmax = -1.0e10
tmin = 1.0e10
do i = 2, iter
tsum = tsum + timers(i)
tmax = MAX(tmax,timers(i))
tmin = MIN(tmin,timers(i))
end do
tavg = tsum / (iter - 1)
print *, "TOTAL ITER: ", iter
write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
110 format (A, F9.6, A, F9.6, A, F9.6, A)
write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
120 format (A, F15.3, A, F15.3)
print *, "END"
deallocate (f1n)
deallocate (f2n)
deallocate (eorb)
deallocate (Jia)
deallocate (Kia)
deallocate (Tia)
deallocate (Xia)
deallocate (Tkj)
deallocate (Kkj)
deallocate (t1v1)
deallocate (t1v2)
deallocate (timers)
end program
Performance Comparison
The following commands were used to compile, link, and run the various versions. In the commands below, substitute the actual source file name for TEST.f.
Compile:
ifx -O3 -fiopenmp -fopenmp-targets=spir64 -DMKL -qmkl -fpp -free -D DO_UPDATE_ARRAYS TEST.f -c -o TEST.o
Link:
ifx -fiopenmp -fopenmp-targets=spir64 -qmkl -fsycl -L${MKLROOT}/lib/intel64 \
-liomp5 -lsycl -lOpenCL -lstdc++ -lpthread -lm -ldl -lmkl_sycl \
-D DO_UPDATE_ARRAYS TEST.o -o TEST.exe
Run:
LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH=copy OMP_TARGET_OFFLOAD=MANDATORY \
ZE_AFFINITY_MASK=0 LIBOMPTARGET_DEBUG=0 LIBOMPTARGET_PLUGIN=level0 TEST.exe
We compared the performance of the three versions on the particular GPU used (running on 1 stack only).
The average iteration time for each version was as follows.
Version 1 (test-SharedMem.f): 0.115163 seconds
Version 2 (test-Map-UpdateTo.f): 0.002012 seconds
Version 3 (test-DeviceMem-Map-UpdateTo.f): 0.001978 seconds