Explicit Scaling Using Intel® oneAPI Math Kernel Library (oneMKL) in OpenMP
This section describes how to use explicit scaling with Intel® oneAPI Math Kernel Library (oneMKL) on a multi-stack platform for an OpenMP offload example.
The code below shows a full example of how to split an AXPY computation across two stacks. The example demonstrates explicit scaling for the cblas_daxpy API.
#include "mkl.h"
#include "mkl_omp_offload.h"
#include <omp.h>
#include <stdio.h>
int main() {
double *x, *y, *y_ref, alpha;
MKL_INT n, incx, incy, i;
// Initialize data for AXPY
alpha = 1.0;
n = 8192;
incx = 1;
incy = 1;
// Allocate and initialize arrays for vectors
x = (double *)mkl_malloc(n * sizeof(double), 128);
y = (double *)mkl_malloc(n * sizeof(double), 128);
if ((x == NULL) || (y == NULL)) {
printf("Error in vector allocation\n");
return 1;
}
for (i = 0; i < n; i++) {
x[i] = rand() / (double)RAND_MAX - .5;
y[i] = rand() / (double)RAND_MAX - .5;
}
printf("First 10 elements of the output vector Y before AXPY:\n");
for (i = 0; i < 10; i++) {
printf("%lf ", y[i]);
}
printf("\n\n");
// Detect number of available stacks
int nb_device = omp_get_num_devices();
// Copy data to device and perform computation
if (omp_get_num_devices() > 1) {
printf("2 stacks are detected. AXPY operation is divided into 2 to take "
"advantage of explicit scaling\n");
printf("Copy x[0..%lld] and y[0..%lld] to stack 0\n", n / 2 - 1, n / 2 - 1);
#pragma omp target data map(to \
: x [0:n / 2]) map(tofrom \
: y [0:n / 2]) device(0)
{
printf("Copy x[%lld..%lld] and y[%lld..%lld] to stack 1\n", n / 2, n - 1,
n / 2, n - 1);
#pragma omp target data map(to \
: x [n / 2:n - n / 2]) map(tofrom \
: y [n / 2:n - n / 2]) \
device(1)
{
double *x1 = &x[n / 2];
double *y1 = &y[n / 2];
#pragma omp target variant dispatch use_device_ptr(x, y) device(0) nowait
cblas_daxpy(n / 2, alpha, x, incx, y, incy);
#pragma omp target variant dispatch use_device_ptr(x1, y1) device(1)
cblas_daxpy(n / 2, alpha, x1, incx, y1, incy);
#pragma omp taskwait
}
}
} else {
printf("1 stack is detected. Entire AXPY operation is performed on that "
"stack\n");
printf("Copy x[0..%lld] and y[0..%lld] to stack 0\n", n - 1, n - 1);
#pragma omp target data map(to : x [0:n]) map(tofrom : y [0:n]) device(0)
{
#pragma omp target variant dispatch use_device_ptr(x, y) device(0)
cblas_daxpy(n, alpha, x, incx, y, incy);
#pragma omp taskwait
}
}
// End of computation
printf("\nFirst 10 elements of the output vector Y after AXPY:\n");
for (i = 0; i < 10; i++) {
printf("%lf ", y[i]);
}
printf("\n");
mkl_free(x);
mkl_free(y);
return 0;
}
In this example, the AXPY parameters and vectors are first allocated and initialized.
// Initialize data for AXPY
alpha = 1.0;
n = 8192;
incx = 1;
incy = 1;

// Allocate and initialize arrays for vectors
x = (double *)mkl_malloc(n * sizeof(double), 128);
y = (double *)mkl_malloc(n * sizeof(double), 128);
if ((x == NULL) || (y == NULL)) {
    printf("Error in vector allocation\n");
    return 1;
}
for (i = 0; i < n; i++) {
    x[i] = rand() / (double)RAND_MAX - .5;
    y[i] = rand() / (double)RAND_MAX - .5;
}
printf("First 10 elements of the output vector Y before AXPY:\n");
for (i = 0; i < 10; i++) {
    printf("%lf ", y[i]);
}
printf("\n\n");
Next, the number of available stacks is detected using the omp_get_num_devices API.
// Detect number of available stacks
int nb_device = omp_get_num_devices();
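As an optional check (not part of the original example), the detected count can be printed to confirm that each stack is exposed as a separate OpenMP device when LIBOMPTARGET_DEVICES=subdevice is set:
// Optional check (illustration only): each stack appears as one OpenMP device
printf("Number of OpenMP devices (stacks) detected: %d\n", nb_device);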
Finally, data is transferred and the oneMKL cblas_daxpy calls are offloaded to the GPU using OpenMP. Note that if two stacks are detected, half of the x and y vectors are copied to each stack and the AXPY operation is divided in two to take advantage of explicit scaling. The nowait clause on the first dispatch lets the second cblas_daxpy call be submitted to stack 1 without waiting for stack 0, and the taskwait directive ensures both calls have completed before the data is copied back.
// Copy data to device and perform computation
if (nb_device > 1) {
    printf("2 stacks are detected. AXPY operation is divided into 2 to take "
           "advantage of explicit scaling\n");
    printf("Copy x[0..%lld] and y[0..%lld] to stack 0\n", n / 2 - 1, n / 2 - 1);
    #pragma omp target data map(to: x[0:n/2]) map(tofrom: y[0:n/2]) device(0)
    {
        printf("Copy x[%lld..%lld] and y[%lld..%lld] to stack 1\n", n / 2, n - 1,
               n / 2, n - 1);
        #pragma omp target data map(to: x[n/2:n-n/2]) map(tofrom: y[n/2:n-n/2]) device(1)
        {
            double *x1 = &x[n / 2];
            double *y1 = &y[n / 2];

            // First half on stack 0; nowait lets the second dispatch start
            // immediately so both stacks compute concurrently
            #pragma omp target variant dispatch use_device_ptr(x, y) device(0) nowait
            cblas_daxpy(n / 2, alpha, x, incx, y, incy);

            // Second half on stack 1
            #pragma omp target variant dispatch use_device_ptr(x1, y1) device(1)
            cblas_daxpy(n / 2, alpha, x1, incx, y1, incy);

            // Wait until both stacks have finished
            #pragma omp taskwait
        }
    }
} else {
    printf("1 stack is detected. Entire AXPY operation is performed on that "
           "stack\n");
    printf("Copy x[0..%lld] and y[0..%lld] to stack 0\n", n - 1, n - 1);
    #pragma omp target data map(to: x[0:n]) map(tofrom: y[0:n]) device(0)
    {
        #pragma omp target variant dispatch use_device_ptr(x, y) device(0)
        cblas_daxpy(n, alpha, x, incx, y, incy);
        #pragma omp taskwait
    }
}
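The same pattern extends to more than two stacks. The sketch below is an illustration only (it is not part of the original example and assumes, for simplicity, that n is divisible by the number of detected stacks); it maps one chunk of x and y to each device and dispatches one cblas_daxpy call per stack:
// Sketch: explicit scaling across nb_device stacks (assumes n % nb_device == 0)
MKL_INT chunk = n / nb_device;
for (int dev = 0; dev < nb_device; dev++) {
    double *xd = &x[dev * chunk];
    double *yd = &y[dev * chunk];

    // Map this chunk of x and y to device 'dev'
    #pragma omp target enter data map(to: xd[0:chunk], yd[0:chunk]) device(dev)

    // Dispatch asynchronously so all stacks compute concurrently
    #pragma omp target variant dispatch use_device_ptr(xd, yd) device(dev) nowait
    cblas_daxpy(chunk, alpha, xd, incx, yd, incy);
}
#pragma omp taskwait // wait for all stacks to finish

for (int dev = 0; dev < nb_device; dev++) {
    double *xd = &x[dev * chunk];
    double *yd = &y[dev * chunk];

    // Copy each result chunk back and release the device copies
    #pragma omp target exit data map(from: yd[0:chunk]) map(release: xd[0:chunk]) device(dev)
}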
To build and run this example, the commands below can be used after setting the $MKLROOT variable. Setting LIBOMPTARGET_DEVICES=subdevice is required to expose the subdevices (stacks) to OpenMP as separate devices.
export LIBOMPTARGET_DEVICES=subdevice
icpx -fiopenmp -fopenmp-targets=spir64 -m64 -DMKL_ILP64 -qmkl-ilp64=parallel -lstdc++ -o onemkl_openmp_axpy onemkl_openmp_axpy.cpp
export LD_LIBRARY_PATH=${MKLROOT}/lib/intel64/:${LD_LIBRARY_PATH}
./onemkl_openmp_axpy
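To compare against a single-device baseline on the same GPU, the whole card can instead be exposed as one OpenMP device (for example, by setting LIBOMPTARGET_DEVICES=device, if the installed offload runtime supports that value):
export LIBOMPTARGET_DEVICES=device
./onemkl_openmp_axpy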
For large problem sizes (for example, n = 150000000), close to 2X performance scaling is expected from explicit scaling on two stacks.
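To measure the actual scaling on a given system, the offloaded region can be timed with omp_get_wtime. The lines below are a minimal sketch (not part of the original example) showing where the timer calls would be placed around the computation block:
double t_start = omp_get_wtime();   // start timing before the target data regions

// ... the "Copy data to device and perform computation" block from the example ...

double t_end = omp_get_wtime();     // stop timing once taskwait has completed
printf("AXPY offload time: %f seconds\n", t_end - t_start);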