Explicit Scaling Using Intel® oneAPI Math Kernel Library (oneMKL) in OpenMP
This section describes how to use explicit scaling with Intel® oneAPI Math Kernel Library (oneMKL) on a multi-stack platform for an OpenMP offload example.
The code below shows a full example of how to split an AXPY computation across two stacks. The example demonstrates explicit scaling for the cblas_daxpy API.
#include "mkl.h"
#include "mkl_omp_offload.h"
#include <omp.h>
#include <stdio.h>
int main() {
double *x, *y, *y_ref, alpha;
MKL_INT n, incx, incy, i;
// Initialize data for AXPY
alpha = 1.0;
n = 8192;
incx = 1;
incy = 1;
// Allocate and initialize arrays for vectors
x = (double *)mkl_malloc(n * sizeof(double), 128);
y = (double *)mkl_malloc(n * sizeof(double), 128);
if ((x == NULL) || (y == NULL)) {
printf("Error in vector allocation\n");
return 1;
}
for (i = 0; i < n; i++) {
x[i] = rand() / (double)RAND_MAX - .5;
y[i] = rand() / (double)RAND_MAX - .5;
}
printf("First 10 elements of the output vector Y before AXPY:\n");
for (i = 0; i < 10; i++) {
printf("%lf ", y[i]);
}
printf("\n\n");
// Detect number of available stacks
int nb_device = omp_get_num_devices();
// Copy data to device and perform computation
if (omp_get_num_devices() > 1) {
printf("2 stacks are detected. AXPY operation is divided into 2 to take "
"advantage of explicit scaling\n");
printf("Copy x[0..%lld] and y[0..%lld] to stack 0\n", n / 2 - 1, n / 2 - 1);
#pragma omp target data map(to \
: x [0:n / 2]) map(tofrom \
: y [0:n / 2]) device(0)
{
printf("Copy x[%lld..%lld] and y[%lld..%lld] to stack 1\n", n / 2, n - 1,
n / 2, n - 1);
#pragma omp target data map(to \
: x [n / 2:n - n / 2]) map(tofrom \
: y [n / 2:n - n / 2]) \
device(1)
{
double *x1 = &x[n / 2];
double *y1 = &y[n / 2];
#pragma omp target variant dispatch use_device_ptr(x, y) device(0) nowait
cblas_daxpy(n / 2, alpha, x, incx, y, incy);
#pragma omp target variant dispatch use_device_ptr(x1, y1) device(1)
cblas_daxpy(n / 2, alpha, x1, incx, y1, incy);
#pragma omp taskwait
}
}
} else {
printf("1 stack is detected. Entire AXPY operation is performed on that "
"stack\n");
printf("Copy x[0..%lld] and y[0..%lld] to stack 0\n", n - 1, n - 1);
#pragma omp target data map(to : x [0:n]) map(tofrom : y [0:n]) device(0)
{
#pragma omp target variant dispatch use_device_ptr(x, y) device(0)
cblas_daxpy(n, alpha, x, incx, y, incy);
#pragma omp taskwait
}
}
// End of computation
printf("\nFirst 10 elements of the output vector Y after AXPY:\n");
for (i = 0; i < 10; i++) {
printf("%lf ", y[i]);
}
printf("\n");
mkl_free(x);
mkl_free(y);
return 0;
}
In this example, the AXPY parameters and vectors are first allocated and initialized.
// Initialize data for AXPY
alpha = 1.0;
n = 8192;
incx = 1;
incy = 1;

// Allocate and initialize arrays for vectors
x = (double *)mkl_malloc(n * sizeof(double), 128);
y = (double *)mkl_malloc(n * sizeof(double), 128);
if ((x == NULL) || (y == NULL)) {
    printf("Error in vector allocation\n");
    return 1;
}
for (i = 0; i < n; i++) {
    x[i] = rand() / (double)RAND_MAX - .5;
    y[i] = rand() / (double)RAND_MAX - .5;
}
printf("First 10 elements of the output vector Y before AXPY:\n");
for (i = 0; i < 10; i++) {
    printf("%lf ", y[i]);
}
printf("\n\n");
Next, the number of available stacks is detected using the omp_get_num_devices API.
// Detect number of available stacks
int nb_device = omp_get_num_devices();
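As an optional check (not part of the original example), the detected count can be printed to confirm that each stack is exposed as a separate OpenMP device when LIBOMPTARGET_DEVICES=subdevice is set:
// Optional check (illustration only): each stack appears as one OpenMP device
printf("Number of OpenMP devices (stacks) detected: %d\n", nb_device);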
Finally, data is transferred and the oneMKL cblas_daxpy calls are offloaded to the GPU using OpenMP. Note that if two stacks are detected, half of the x and y vectors are copied to each stack and the AXPY operation is divided in two to take advantage of explicit scaling. The nowait clause on the first dispatch lets the second cblas_daxpy call be submitted to stack 1 without waiting for stack 0, and the taskwait directive ensures both calls have completed before the data is copied back.
// Copy data to device and perform computation
if (nb_device > 1) {
    printf("2 stacks are detected. AXPY operation is divided into 2 to take "
           "advantage of explicit scaling\n");
    printf("Copy x[0..%lld] and y[0..%lld] to stack 0\n", n / 2 - 1, n / 2 - 1);
    #pragma omp target data map(to: x[0:n/2]) map(tofrom: y[0:n/2]) device(0)
    {
        printf("Copy x[%lld..%lld] and y[%lld..%lld] to stack 1\n", n / 2, n - 1,
               n / 2, n - 1);
        #pragma omp target data map(to: x[n/2:n-n/2]) map(tofrom: y[n/2:n-n/2]) device(1)
        {
            double *x1 = &x[n / 2];
            double *y1 = &y[n / 2];

            // First half on stack 0; nowait lets the second dispatch start
            // immediately so both stacks compute concurrently
            #pragma omp target variant dispatch use_device_ptr(x, y) device(0) nowait
            cblas_daxpy(n / 2, alpha, x, incx, y, incy);

            // Second half on stack 1
            #pragma omp target variant dispatch use_device_ptr(x1, y1) device(1)
            cblas_daxpy(n / 2, alpha, x1, incx, y1, incy);

            // Wait until both stacks have finished
            #pragma omp taskwait
        }
    }
} else {
    printf("1 stack is detected. Entire AXPY operation is performed on that "
           "stack\n");
    printf("Copy x[0..%lld] and y[0..%lld] to stack 0\n", n - 1, n - 1);
    #pragma omp target data map(to: x[0:n]) map(tofrom: y[0:n]) device(0)
    {
        #pragma omp target variant dispatch use_device_ptr(x, y) device(0)
        cblas_daxpy(n, alpha, x, incx, y, incy);
        #pragma omp taskwait
    }
}
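The same pattern extends to more than two stacks. The sketch below is an illustration only (it is not part of the original example and assumes, for simplicity, that n is divisible by the number of detected stacks); it maps one chunk of x and y to each device and dispatches one cblas_daxpy call per stack:
// Sketch: explicit scaling across nb_device stacks (assumes n % nb_device == 0)
MKL_INT chunk = n / nb_device;
for (int dev = 0; dev < nb_device; dev++) {
    double *xd = &x[dev * chunk];
    double *yd = &y[dev * chunk];

    // Map this chunk of x and y to device 'dev'
    #pragma omp target enter data map(to: xd[0:chunk], yd[0:chunk]) device(dev)

    // Dispatch asynchronously so all stacks compute concurrently
    #pragma omp target variant dispatch use_device_ptr(xd, yd) device(dev) nowait
    cblas_daxpy(chunk, alpha, xd, incx, yd, incy);
}
#pragma omp taskwait // wait for all stacks to finish

for (int dev = 0; dev < nb_device; dev++) {
    double *xd = &x[dev * chunk];
    double *yd = &y[dev * chunk];

    // Copy each result chunk back and release the device copies
    #pragma omp target exit data map(from: yd[0:chunk]) map(release: xd[0:chunk]) device(dev)
}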
To build and run this example, the commands below can be used after setting the $MKLROOT variable. Setting LIBOMPTARGET_DEVICES=subdevice is required to expose the subdevices (stacks) to OpenMP as separate devices.
export LIBOMPTARGET_DEVICES=subdevice
icpx -fiopenmp -fopenmp-targets=spir64 -m64 -DMKL_ILP64 -qmkl-ilp64=parallel -lstdc++ -o onemkl_openmp_axpy onemkl_openmp_axpy.cpp
export LD_LIBRARY_PATH=${MKLROOT}/lib/intel64/:${LD_LIBRARY_PATH}
./onemkl_openmp_axpy
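To compare against a single-device baseline on the same GPU, the whole card can instead be exposed as one OpenMP device (for example, by setting LIBOMPTARGET_DEVICES=device, if the installed offload runtime supports that value):
export LIBOMPTARGET_DEVICES=device
./onemkl_openmp_axpy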
For large problem sizes (for example, n = 150000000), close to 2X performance scaling is expected from explicit scaling on two stacks.
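To measure the actual scaling on a given system, the offloaded region can be timed with omp_get_wtime. The lines below are a minimal sketch (not part of the original example) showing where the timer calls would be placed around the computation block:
double t_start = omp_get_wtime();   // start timing before the target data regions

// ... the "Copy data to device and perform computation" block from the example ...

double t_end = omp_get_wtime();     // stop timing once taskwait has completed
printf("AXPY offload time: %f seconds\n", t_end - t_start);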