Explicit Scaling Using Intel® oneAPI Math Kernel Library (oneMKL) in SYCL
This section describes how to use explicit scaling with Intel® oneAPI Math Kernel Library (oneMKL) on a multi-stack platform.
The code below shows a full example of how to split a GEMM computation across two stacks. The example demonstrates explicit scaling for the {S,D,C,Z}GEMM APIs.
#include "mkl.h"
#include "oneapi/mkl/blas.hpp"
#include <CL/sycl.hpp>
// Helper functions to set scalar values for real and complex data types.
// For complex types, the real parts of the two arguments become the real
// and imaginary parts of the result.
template <typename fp> fp set_fp_value(fp arg1, fp arg2 = 0.0) { return arg1; }
template <typename fp>
std::complex<fp> set_fp_value(std::complex<fp> arg1,
std::complex<fp> arg2 = 0.0) {
return std::complex<fp>(arg1.real(), arg2.real());
}
// Initialize an m x n matrix stored column-major with leading dimension ld;
// when trans is not nontrans, the transposed matrix is initialized instead.
template <typename fp>
void init_matrix(fp *M, oneapi::mkl::transpose trans, int64_t m, int64_t n,
                 int64_t ld) {
if (trans == oneapi::mkl::transpose::nontrans) {
for (int64_t j = 0; j < n; j++)
for (int64_t i = 0; i < m; i++)
M[i + j * ld] = fp(i + j);
} else {
for (int64_t i = 0; i < m; i++)
for (int64_t j = 0; j < n; j++)
M[j + i * ld] = fp(i + j);
}
}
// GEMM operation is performed as shown below.
// C = alpha * op(A) * op(B) + beta * C
template <typename fp> void run_gemm(const sycl::device &dev) {
// Initialize data for GEMM
oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;
// Matrix data sizes
int64_t m = 1024;
int64_t n = 1024;
int64_t k = 1024;
// Leading dimensions of matrices
int64_t ldA = 1024;
int64_t ldB = 1024;
int64_t ldC = 1024;
// Set scalar fp values
  fp alpha = set_fp_value(fp(2.0), fp(-0.5));
  fp beta = set_fp_value(fp(3.0), fp(-1.5));
  // Create subdevices for a multi-stack device
std::vector<sycl::device> subdev = {};
sycl::queue subdev_queue0;
sycl::queue subdev_queue1;
try {
using namespace sycl::info;
subdev = dev.create_sub_devices<
partition_property::partition_by_affinity_domain>(
partition_affinity_domain::numa);
std::cout << "\tUse " << subdev.size() << " subdevices" << std::endl;
} catch (sycl::exception &e) {
std::cout << "\tSubdevices not found, use root device" << std::endl;
}
int64_t nb_device = subdev.size();
// Catch asynchronous exceptions
auto exception_handler = [](sycl::exception_list exceptions) {
for (std::exception_ptr const &e : exceptions) {
try {
std::rethrow_exception(e);
} catch (sycl::exception const &e) {
std::cout << "Caught asynchronous SYCL exception during GEMM:\n"
<< e.what() << std::endl;
}
}
};
// Create context and execution queue
subdev.push_back(dev);
sycl::context cxt(subdev);
sycl::queue main_queue(cxt, dev, exception_handler);
// If two subdevices are detected, create queue for each subdevice
if (nb_device > 1) {
subdev_queue0 = sycl::queue(cxt, subdev[0], exception_handler);
subdev_queue1 = sycl::queue(cxt, subdev[1], exception_handler);
}
// Allocate and initialize arrays for matrices
int64_t sizea;
if (transA == oneapi::mkl::transpose::nontrans)
sizea = ldA * k;
else
sizea = ldA * m;
int64_t sizeb = ldB * n;
int64_t sizec = ldC * n;
auto A_host = sycl::malloc_host<fp>(sizea, main_queue);
auto B_host = sycl::malloc_host<fp>(sizeb, main_queue);
auto C_host = sycl::malloc_host<fp>(sizec, main_queue);
init_matrix(A_host, transA, m, k, ldA);
init_matrix(B_host, transB, k, n, ldB);
init_matrix(C_host, oneapi::mkl::transpose::nontrans, m, n, ldC);
  // Copy A/B/C from host to the device or subdevices.
  // When two stacks are detected, the GEMM operation is split between the
  // stacks in the n direction: the entire A matrix is copied to both stacks,
  // while the B and C matrices are split so that only half of each is copied
  // to each stack.
  fp *A_dev0 = nullptr, *A_dev1 = nullptr, *B_dev0 = nullptr, *B_dev1 = nullptr,
     *C_dev0 = nullptr, *C_dev1 = nullptr, *A_dev = nullptr, *B_dev = nullptr,
     *C_dev = nullptr;
if (nb_device > 1) {
A_dev0 = sycl::malloc_device<fp>(sizea, subdev_queue0);
A_dev1 = sycl::malloc_device<fp>(sizea, subdev_queue1);
B_dev0 = sycl::malloc_device<fp>(sizeb / 2, subdev_queue0);
B_dev1 = sycl::malloc_device<fp>(sizeb / 2, subdev_queue1);
C_dev0 = sycl::malloc_device<fp>(sizec / 2, subdev_queue0);
C_dev1 = sycl::malloc_device<fp>(sizec / 2, subdev_queue1);
// Entire A matrix is copied to both stacks
subdev_queue0.memcpy(A_dev0, A_host, sizea * sizeof(fp));
subdev_queue1.memcpy(A_dev1, A_host, sizea * sizeof(fp));
// Half of B and C are copied to each stack
subdev_queue0.memcpy(B_dev0, B_host, (sizeb / 2) * sizeof(fp));
subdev_queue1.memcpy(B_dev1, B_host + ldB * n / 2,
(sizeb / 2) * sizeof(fp));
subdev_queue0.memcpy(C_dev0, C_host, (sizec / 2) * sizeof(fp));
subdev_queue1.memcpy(C_dev1, C_host + ldC * n / 2,
(sizec / 2) * sizeof(fp));
subdev_queue0.wait();
subdev_queue1.wait();
} else {
A_dev = sycl::malloc_device<fp>(sizea, main_queue);
B_dev = sycl::malloc_device<fp>(sizeb, main_queue);
C_dev = sycl::malloc_device<fp>(sizec, main_queue);
main_queue.memcpy(A_dev, A_host, sizea * sizeof(fp));
main_queue.memcpy(B_dev, B_host, sizeb * sizeof(fp));
main_queue.memcpy(C_dev, C_host, sizec * sizeof(fp));
main_queue.wait();
}
// Call oneMKL GEMM API
// When two stacks are detected, GEMM call is launched on each stack
sycl::event gemm_done0;
sycl::event gemm_done1;
std::vector<sycl::event> gemm_dependencies;
try {
if (nb_device > 1) {
// Split B and C for each stack
int64_t n_subdev = n / 2;
gemm_done0 = oneapi::mkl::blas::gemm(
subdev_queue0, transA, transB, m, n_subdev, k, alpha, A_dev0, ldA,
B_dev0, ldB, beta, C_dev0, ldC, gemm_dependencies);
gemm_done1 = oneapi::mkl::blas::gemm(
subdev_queue1, transA, transB, m, n_subdev, k, alpha, A_dev1, ldA,
B_dev1, ldB, beta, C_dev1, ldC, gemm_dependencies);
} else {
gemm_done0 = oneapi::mkl::blas::gemm(main_queue, transA, transB, m, n, k,
alpha, A_dev, ldA, B_dev, ldB, beta,
C_dev, ldC, gemm_dependencies);
}
} catch (sycl::exception const &e) {
std::cout << "\t\tCaught synchronous SYCL exception during GEMM:\n"
<< e.what() << std::endl;
}
// Wait for GEMM calls to finish
gemm_done0.wait();
if (nb_device > 1)
gemm_done1.wait();
// Copy C from device/subdevices to host
if (nb_device > 1) {
subdev_queue0.memcpy(C_host, C_dev0, (sizec / 2) * sizeof(fp));
subdev_queue1.memcpy(C_host + ldC * n / 2, C_dev1,
(sizec / 2) * sizeof(fp));
subdev_queue0.wait();
subdev_queue1.wait();
} else {
main_queue.memcpy(C_host, C_dev, sizec * sizeof(fp));
main_queue.wait();
}
// Clean-up
  sycl::free(A_host, cxt);
  sycl::free(B_host, cxt);
  sycl::free(C_host, cxt);
if (nb_device > 1) {
sycl::free(A_dev0, subdev_queue0);
sycl::free(A_dev1, subdev_queue1);
sycl::free(B_dev0, subdev_queue0);
sycl::free(B_dev1, subdev_queue1);
sycl::free(C_dev0, subdev_queue0);
sycl::free(C_dev1, subdev_queue1);
} else {
sycl::free(A_dev, main_queue);
sycl::free(B_dev, main_queue);
sycl::free(C_dev, main_queue);
}
if (nb_device > 1)
std::cout << "\tGEMM operation is complete on 2 stacks" << std::endl;
else
std::cout << "\tGEMM operation is complete on 1 stack" << std::endl;
}
// Main entry point for example.
// GEMM example is run on GPU for 4 data types.
int main(int argc, char **argv) {
// Create GPU device
sycl::device dev = sycl::device(sycl::gpu_selector_v);
std::cout << "Running with single precision real data type:" << std::endl;
run_gemm<float>(dev);
std::cout << "Running with double precision real data type:" << std::endl;
run_gemm<double>(dev);
std::cout << "Running with single precision complex data type:" << std::endl;
run_gemm<std::complex<float>>(dev);
std::cout << "Running with double precision complex data type:" << std::endl;
run_gemm<std::complex<double>>(dev);
return 0;
}
In this example, the scalar and dimension parameters for GEMM are initialized first.
// Initialize data for GEMM
oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;
// Matrix data sizes
int64_t m = 1024;
int64_t n = 1024;
int64_t k = 1024;
// Leading dimensions of matrices
int64_t ldA = 1024;
int64_t ldB = 1024;
int64_t ldC = 1024;
// Set scalar fp values
fp alpha = set_fp_value(fp(2.0), fp(-0.5));
fp beta = set_fp_value(fp(3.0), fp(-1.5));
Next, if available, subdevices are created from the root device and stored in a std::vector. If the platform has no subdevices (for example, a single-stack device), the root device is used for the computation. Note that the example also demonstrates how to handle asynchronous exceptions.
// Create subdevices for a multi-stack device
std::vector<sycl::device> subdev = {};
sycl::queue subdev_queue0;
sycl::queue subdev_queue1;
try {
using namespace sycl::info;
subdev = dev.create_sub_devices<
partition_property::partition_by_affinity_domain>(
partition_affinity_domain::numa);
std::cout << "\tUse " << subdev.size() << " subdevices" << std::endl;
} catch (sycl::exception &e) {
std::cout << "\tSubdevices not found, use root device" << std::endl;
}
int64_t nb_device = subdev.size();
// Catch asynchronous exceptions
auto exception_handler = [](sycl::exception_list exceptions) {
for (std::exception_ptr const &e : exceptions) {
try {
std::rethrow_exception(e);
} catch (sycl::exception const &e) {
std::cout << "Caught asynchronous SYCL exception during GEMM:\n"
<< e.what() << std::endl;
}
}
};
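As an alternative to relying on the exception thrown by create_sub_devices, partition support can be queried up front through the SYCL device info API. The following sketch is not part of the example above and additionally requires <algorithm>; it only illustrates this alternative.
// Sketch: query partition support before attempting to create subdevices.
auto props = dev.get_info<sycl::info::device::partition_properties>();
bool can_partition =
    std::find(props.begin(), props.end(),
              sycl::info::partition_property::partition_by_affinity_domain) !=
    props.end();
std::vector<sycl::device> subdevices;
if (can_partition) {
  // The supported affinity domains can also be checked via
  // sycl::info::device::partition_affinity_domains.
  subdevices = dev.create_sub_devices<
      sycl::info::partition_property::partition_by_affinity_domain>(
      sycl::info::partition_affinity_domain::numa);
}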
The context and queues are then created. If subdevices were detected earlier, a separate queue is created for each subdevice.
// Create context and execution queue
subdev.push_back(dev);
sycl::context cxt(subdev);
sycl::queue main_queue(cxt, dev, exception_handler);
// If two subdevices are detected, create queue for each subdevice
if (nb_device > 1) {
subdev_queue0 = sycl::queue(cxt, subdev[0], exception_handler);
subdev_queue1 = sycl::queue(cxt, subdev[1], exception_handler);
}
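The queues above are out-of-order queues, so the example enforces ordering with explicit event and queue waits. A possible alternative, sketched below under the assumption that the same context, subdevices, and exception handler are used, is to create in-order queues with the sycl::property::queue::in_order property so that transfers and GEMM calls submitted to the same queue execute in submission order.
// Sketch: in-order queues serialize work submitted to the same queue, which
// removes the need for explicit waits between the memcpy and GEMM submissions
// on that queue. A final wait is still required before the host reads results.
sycl::property_list in_order_props{sycl::property::queue::in_order{}};
sycl::queue in_order_queue0(cxt, subdev[0], exception_handler, in_order_props);
sycl::queue in_order_queue1(cxt, subdev[1], exception_handler, in_order_props);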
Before the oneMKL calls are invoked, the input and output matrices must be allocated, initialized, and transferred from host to device. Note that when two subdevices are detected, the first half of the GEMM computation is performed on the first stack using the entire A matrix and the first half of the B and C matrices, while the second half is performed on the second stack using the entire A matrix and the second halves of B and C. Because each column of C depends only on all of A and the corresponding column of B, the work can be split along the n direction with no data exchange between the stacks; the matrices therefore need to be copied to each stack accordingly.
// Allocate and initialize arrays for matrices
int64_t sizea;
if (transA == oneapi::mkl::transpose::nontrans)
sizea = ldA * k;
else
sizea = ldA * m;
int64_t sizeb = ldB * n;
int64_t sizec = ldC * n;
auto A_host = sycl::malloc_host<fp>(sizea, main_queue);
auto B_host = sycl::malloc_host<fp>(sizeb, main_queue);
auto C_host = sycl::malloc_host<fp>(sizec, main_queue);
init_matrix(A_host, transA, m, k, ldA);
init_matrix(B_host, transB, k, n, ldB);
init_matrix(C_host, oneapi::mkl::transpose::nontrans, m, n, ldC);
// Copy A/B/C from host to the device or subdevices.
// When two stacks are detected, the GEMM operation is split between the
// stacks in the n direction: the entire A matrix is copied to both stacks,
// while the B and C matrices are split so that only half of each is copied
// to each stack.
fp *A_dev0 = nullptr, *A_dev1 = nullptr, *B_dev0 = nullptr, *B_dev1 = nullptr,
   *C_dev0 = nullptr, *C_dev1 = nullptr, *A_dev = nullptr, *B_dev = nullptr,
   *C_dev = nullptr;
if (nb_device > 1) {
A_dev0 = sycl::malloc_device<fp>(sizea, subdev_queue0);
A_dev1 = sycl::malloc_device<fp>(sizea, subdev_queue1);
B_dev0 = sycl::malloc_device<fp>(sizeb / 2, subdev_queue0);
B_dev1 = sycl::malloc_device<fp>(sizeb / 2, subdev_queue1);
C_dev0 = sycl::malloc_device<fp>(sizec / 2, subdev_queue0);
C_dev1 = sycl::malloc_device<fp>(sizec / 2, subdev_queue1);
// Entire A matrix is copied to both stacks
subdev_queue0.memcpy(A_dev0, A_host, sizea * sizeof(fp));
subdev_queue1.memcpy(A_dev1, A_host, sizea * sizeof(fp));
// Half of B and C are copied to each stack
subdev_queue0.memcpy(B_dev0, B_host, (sizeb / 2) * sizeof(fp));
subdev_queue1.memcpy(B_dev1, B_host + ldB * n / 2,
(sizeb / 2) * sizeof(fp));
subdev_queue0.memcpy(C_dev0, C_host, (sizec / 2) * sizeof(fp));
subdev_queue1.memcpy(C_dev1, C_host + ldC * n / 2,
(sizec / 2) * sizeof(fp));
subdev_queue0.wait();
subdev_queue1.wait();
} else {
A_dev = sycl::malloc_device<fp>(sizea, main_queue);
B_dev = sycl::malloc_device<fp>(sizeb, main_queue);
C_dev = sycl::malloc_device<fp>(sizec, main_queue);
main_queue.memcpy(A_dev, A_host, sizea * sizeof(fp));
main_queue.memcpy(B_dev, B_host, sizeb * sizeof(fp));
main_queue.memcpy(C_dev, C_host, sizec * sizeof(fp));
main_queue.wait();
}
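For reference, the same n-direction split can be written for an arbitrary number of subdevices. The sketch below is not part of the example; it assumes column-major storage, transB equal to nontrans (as above), and a hypothetical std::vector<sycl::queue> named queues holding one queue per subdevice.
// Sketch: split B and C by columns across nb_device queues.
std::vector<fp *> A_parts(nb_device), B_parts(nb_device), C_parts(nb_device);
int64_t col_offset = 0;
for (int64_t d = 0; d < nb_device; d++) {
  // Columns handled by queue d; any remainder of n is distributed.
  int64_t n_d = n / nb_device + (d < n % nb_device ? 1 : 0);
  A_parts[d] = sycl::malloc_device<fp>(sizea, queues[d]);
  B_parts[d] = sycl::malloc_device<fp>(ldB * n_d, queues[d]);
  C_parts[d] = sycl::malloc_device<fp>(ldC * n_d, queues[d]);
  // The entire A matrix goes to every subdevice.
  queues[d].memcpy(A_parts[d], A_host, sizea * sizeof(fp));
  // Each subdevice receives its own block of columns of B and C.
  queues[d].memcpy(B_parts[d], B_host + ldB * col_offset, ldB * n_d * sizeof(fp));
  queues[d].memcpy(C_parts[d], C_host + ldC * col_offset, ldC * n_d * sizeof(fp));
  col_offset += n_d;
}
for (int64_t d = 0; d < nb_device; d++)
  queues[d].wait();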
Finally, the oneMKL GEMM APIs are called with the corresponding device or subdevice queues, and the updated C matrix is transferred back to the host once the GEMM calls complete.
// Call oneMKL GEMM API
// When two stacks are detected, GEMM call is launched on each stack
sycl::event gemm_done0;
sycl::event gemm_done1;
std::vector<sycl::event> gemm_dependencies;
try {
if (nb_device > 1) {
// Split B and C for each stack
int64_t n_subdev = n / 2;
gemm_done0 = oneapi::mkl::blas::gemm(
subdev_queue0, transA, transB, m, n_subdev, k, alpha, A_dev0, ldA,
B_dev0, ldB, beta, C_dev0, ldC, gemm_dependencies);
gemm_done1 = oneapi::mkl::blas::gemm(
subdev_queue1, transA, transB, m, n_subdev, k, alpha, A_dev1, ldA,
B_dev1, ldB, beta, C_dev1, ldC, gemm_dependencies);
} else {
gemm_done0 = oneapi::mkl::blas::gemm(main_queue, transA, transB, m, n, k,
alpha, A_dev, ldA, B_dev, ldB, beta,
C_dev, ldC, gemm_dependencies);
}
} catch (sycl::exception const &e) {
std::cout << "\t\tCaught synchronous SYCL exception during GEMM:\n"
<< e.what() << std::endl;
}
// Wait for GEMM calls to finish
gemm_done0.wait();
if (nb_device > 1)
gemm_done1.wait();
// Copy C from device/subdevices to host
if (nb_device > 1) {
subdev_queue0.memcpy(C_host, C_dev0, (sizec / 2) * sizeof(fp));
subdev_queue1.memcpy(C_host + ldC * n / 2, C_dev1,
(sizec / 2) * sizeof(fp));
subdev_queue0.wait();
subdev_queue1.wait();
} else {
main_queue.memcpy(C_host, C_dev, sizec * sizeof(fp));
main_queue.wait();
}
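To sanity-check the split computation, a few entries of C_host can be compared against a naive host-side reference. The sketch below assumes the nontrans/nontrans case and the column-major layout used throughout the example, plus a hypothetical buffer C_ref_host holding a copy of the original C taken before the host-to-device transfer.
// Sketch: spot-check a subset of entries of C_host against a naive reference,
// expected = alpha * A * B + beta * C_original (nontrans, column-major).
for (int64_t j = 0; j < n; j += n / 4) {
  for (int64_t i = 0; i < m; i += m / 4) {
    fp acc = fp(0.0);
    for (int64_t p = 0; p < k; p++)
      acc += A_host[i + p * ldA] * B_host[p + j * ldB];
    fp expected = alpha * acc + beta * C_ref_host[i + j * ldC];
    // Compare expected against C_host[i + j * ldC] with a tolerance that
    // accounts for the data type and the accumulation length k.
  }
}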
To build and run this example, use the commands below after setting the MKLROOT environment variable.
icpx -fsycl -qmkl=sequential -DMKL_ILP64 -L${MKLROOT}/lib/intel64 -lsycl -lOpenCL -lpthread -lm -ldl -o onemkl_dpcpp_gemm onemkl_dpcpp_gemm.cpp
export LD_LIBRARY_PATH=${MKLROOT}/lib/intel64/:${LD_LIBRARY_PATH}
./onemkl_dpcpp_gemm
For large problem sizes (for example, m = n = k = 8192), close to 2x performance scaling is expected when using explicit scaling across two stacks.
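One way to verify the scaling on a particular system is to time the two GEMM submissions and their completion around the existing code, for example with std::chrono (requires <chrono>); a warm-up iteration should be run first so that one-time initialization costs are excluded. The following is a minimal sketch that reuses the variables from the two-stack branch of the example.
// Sketch: measure the elapsed time of both GEMM calls running concurrently.
auto t0 = std::chrono::steady_clock::now();
gemm_done0 = oneapi::mkl::blas::gemm(subdev_queue0, transA, transB, m, n / 2, k,
                                     alpha, A_dev0, ldA, B_dev0, ldB, beta,
                                     C_dev0, ldC, gemm_dependencies);
gemm_done1 = oneapi::mkl::blas::gemm(subdev_queue1, transA, transB, m, n / 2, k,
                                     alpha, A_dev1, ldA, B_dev1, ldB, beta,
                                     C_dev1, ldC, gemm_dependencies);
gemm_done0.wait();
gemm_done1.wait();
auto t1 = std::chrono::steady_clock::now();
double seconds = std::chrono::duration<double>(t1 - t0).count();
// A real GEMM performs about 2 * m * n * k floating-point operations.
std::cout << "GEMM GFLOP/s: " << 2.0 * m * n * k / seconds / 1.0e9 << std::endl;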