Visible to Intel only — GUID: GUID-F1A30D1E-8C22-4248-87D9-F9F5C75A347D
Visible to Intel only — GUID: GUID-F1A30D1E-8C22-4248-87D9-F9F5C75A347D
Scaling Performance with Intel® oneAPI Math Kernel Library(oneMKL) in OpenMP
This section describes how to use Intel® oneAPI Math Kernel Library (oneMKL) on a platform with multiple devices to scale up performance in FLAT mode using an OpenMP offload example.
The code below shows full example of how to split AXPY computation across 2 devices. This example demonstrates scaling for cblas_daxpy API.
#include "mkl.h" #include "mkl_omp_offload.h" #include <omp.h> #include <stdio.h> int main() { double *x, *y, *y_ref, alpha; MKL_INT n, incx, incy, i; // Initialize data for AXPY alpha = 1.0; n = 8192; incx = 1; incy = 1; // Allocate and initialize arrays for vectors x = (double *)mkl_malloc(n * sizeof(double), 128); y = (double *)mkl_malloc(n * sizeof(double), 128); if ((x == NULL) || (y == NULL)) { printf("Error in vector allocation\n"); return 1; } for (i = 0; i < n; i++) { x[i] = rand() / (double)RAND_MAX - .5; y[i] = rand() / (double)RAND_MAX - .5; } printf("First 10 elements of the output vector Y before AXPY:\n"); for (i = 0; i < 10; i++) { printf("%lf ", y[i]); } printf("\n\n"); // Detect number of available devices int nb_device = omp_get_num_devices(); // Copy data to device and perform computation if (omp_get_num_devices() > 1) { printf("2 devices are detected. AXPY operation is divided into 2 to take " "advantage of explicit scaling\n"); printf("Copy x[0..%lld] and y[0..%lld] to device 0\n", n / 2 - 1, n / 2 - 1); #pragma omp target data map(to \ : x [0:n / 2]) map(tofrom \ : y [0:n / 2]) device(0) { printf("Copy x[%lld..%lld] and y[%lld..%lld] to device 1\n", n / 2, n - 1, n / 2, n - 1); #pragma omp target data map(to \ : x [n / 2:n - n / 2]) map(tofrom \ : y [n / 2:n - n / 2]) \ device(1) { double *x1 = &x[n / 2]; double *y1 = &y[n / 2]; #pragma omp dispatch device(0) nowait cblas_daxpy(n / 2, alpha, x, incx, y, incy); #pragma omp dispatch device(1) cblas_daxpy(n / 2, alpha, x1, incx, y1, incy); #pragma omp taskwait } } } else { printf("1 device is detected. Entire AXPY operation is performed on that " "device\n"); printf("Copy x[0..%lld] and y[0..%lld] to device 0\n", n - 1, n - 1); #pragma omp target data map(to : x [0:n]) map(tofrom : y [0:n]) device(0) { #pragma omp dispatch device(0) cblas_daxpy(n, alpha, x, incx, y, incy); } } // End of computation printf("\nFirst 10 elements of the output vector Y after AXPY:\n"); for (i = 0; i < 10; i++) { printf("%lf ", y[i]); } printf("\n"); mkl_free(x); mkl_free(y); return 0; }
In this example, first AXPY parameters and vectors are allocated and initialized.
// Initialize data for AXPY alpha = 1.0; n = 8192; incx = 1; incy = 1; // Allocate and initialize arrays for vectors x = (double *)mkl_malloc(n * sizeof(double), 128); y = (double *)mkl_malloc(n * sizeof(double), 128); if ((x == NULL) || (y == NULL)) { printf("Error in vector allocation\n"); return 1; } for (i = 0; i < n; i++) { x[i] = rand() / (double)RAND_MAX - .5; y[i] = rand() / (double)RAND_MAX - .5; } printf("First 10 elements of the output vector Y before AXPY:\n"); for (i = 0; i < 10; i++) { printf("%lf ", y[i]); } printf("\n\n");
Next, number of available devices is detected using omp_get_num_devices API.
// Detect number of available devices int nb_device = omp_get_num_devices();
Finally, data is transferred and oneMKL cblas_daxpy calls are offloaded to GPU using OpenMP. Note that, if 2 devices are detected, half of x and y vectors are copied to each device and AXPY operation is divided into 2 to take advantage of scaling.
// Copy data to device and perform computation if (omp_get_num_devices() > 1) { printf("2 devices are detected. AXPY operation is divided into 2 to take " "advantage of explicit scaling\n"); printf("Copy x[0..%lld] and y[0..%lld] to device 0\n", n / 2 - 1, n / 2 - 1); #pragma omp target data map(to \ : x [0:n / 2]) map(tofrom \ : y [0:n / 2]) device(0) { printf("Copy x[%lld..%lld] and y[%lld..%lld] to device 1\n", n / 2, n - 1, n / 2, n - 1); #pragma omp target data map(to \ : x [n / 2:n - n / 2]) map(tofrom \ : y [n / 2:n - n / 2]) \ device(1) { double *x1 = &x[n / 2]; double *y1 = &y[n / 2]; #pragma omp dispatch device(0) nowait cblas_daxpy(n / 2, alpha, x, incx, y, incy); #pragma omp dispatch device(1) cblas_daxpy(n / 2, alpha, x1, incx, y1, incy); #pragma omp taskwait } } } else { printf("1 device is detected. Entire AXPY operation is performed on that " "device\n"); printf("Copy x[0..%lld] and y[0..%lld] to device 0\n", n - 1, n - 1); #pragma omp target data map(to : x [0:n]) map(tofrom : y [0:n]) device(0) { #pragma omp dispatch device(0) cblas_daxpy(n, alpha, x, incx, y, incy); } }
To be able to run this example, the below build command can be used after setting $MKLROOT variable.
icpx -fiopenmp -fopenmp-targets=spir64 -m64 -DMKL_ILP64 -qmkl-ilp64=parallel -lstdc++ -o onemkl_openmp_axpy onemkl_openmp_axpy.cpp export LD_LIBRARY_PATH=${MKLROOT}/lib/:${LD_LIBRARY_PATH} ./onemkl_openmp_axpy
For large problem sizes (m=150000000), close to 2X performance scaling is expected on two devices.