Explicit Scaling Using Intel® oneAPI Math Kernel Library (oneMKL) in SYCL
This section describes how to use explicit scaling with Intel® oneAPI Math Kernel Library (oneMKL) on a multi-stack platform.
The code below shows a full example of how to split a GEMM computation across two stacks. The example demonstrates explicit scaling for the {S,D,C,Z}GEMM APIs.
#include "mkl.h"
#include "oneapi/mkl/blas.hpp"
#include <CL/sycl.hpp>
// Helper functions to set scalar values for real and complex data types.
// For complex types, the real parts of the two arguments become the real
// and imaginary parts of the result.
template <typename fp> fp set_fp_value(fp arg1, fp arg2 = 0.0) { return arg1; }
template <typename fp>
std::complex<fp> set_fp_value(std::complex<fp> arg1,
std::complex<fp> arg2 = 0.0) {
return std::complex<fp>(arg1.real(), arg2.real());
}
// Initialize an m x n matrix stored column-major with leading dimension ld;
// when trans is not nontrans, the transposed matrix is initialized instead.
template <typename fp>
void init_matrix(fp *M, oneapi::mkl::transpose trans, int64_t m, int64_t n,
                 int64_t ld) {
if (trans == oneapi::mkl::transpose::nontrans) {
for (int64_t j = 0; j < n; j++)
for (int64_t i = 0; i < m; i++)
M[i + j * ld] = fp(i + j);
} else {
for (int64_t i = 0; i < m; i++)
for (int64_t j = 0; j < n; j++)
M[j + i * ld] = fp(i + j);
}
}
// GEMM operation is performed as shown below.
// C = alpha * op(A) * op(B) + beta * C
template <typename fp> void run_gemm(const sycl::device &dev) {
// Initialize data for GEMM
oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;
// Matrix data sizes
int64_t m = 1024;
int64_t n = 1024;
int64_t k = 1024;
// Leading dimensions of matrices
int64_t ldA = 1024;
int64_t ldB = 1024;
int64_t ldC = 1024;
// Set scalar fp values
  fp alpha = set_fp_value(fp(2.0), fp(-0.5));
  fp beta = set_fp_value(fp(3.0), fp(-1.5));
  // Create subdevices for a multi-stack device
std::vector<sycl::device> subdev = {};
sycl::queue subdev_queue0;
sycl::queue subdev_queue1;
try {
using namespace sycl::info;
subdev = dev.create_sub_devices<
partition_property::partition_by_affinity_domain>(
partition_affinity_domain::numa);
std::cout << "\tUse " << subdev.size() << " subdevices" << std::endl;
} catch (sycl::exception &e) {
std::cout << "\tSubdevices not found, use root device" << std::endl;
}
int64_t nb_device = subdev.size();
// Catch asynchronous exceptions
auto exception_handler = [](sycl::exception_list exceptions) {
for (std::exception_ptr const &e : exceptions) {
try {
std::rethrow_exception(e);
} catch (sycl::exception const &e) {
std::cout << "Caught asynchronous SYCL exception during GEMM:\n"
<< e.what() << std::endl;
}
}
};
// Create context and execution queue
subdev.push_back(dev);
sycl::context cxt(subdev);
sycl::queue main_queue(cxt, dev, exception_handler);
// If two subdevices are detected, create queue for each subdevice
if (nb_device > 1) {
subdev_queue0 = sycl::queue(cxt, subdev[0], exception_handler);
subdev_queue1 = sycl::queue(cxt, subdev[1], exception_handler);
}
// Allocate and initialize arrays for matrices
int64_t sizea;
if (transA == oneapi::mkl::transpose::nontrans)
sizea = ldA * k;
else
sizea = ldA * m;
int64_t sizeb = ldB * n;
int64_t sizec = ldC * n;
auto A_host = sycl::malloc_host<fp>(sizea, main_queue);
auto B_host = sycl::malloc_host<fp>(sizeb, main_queue);
auto C_host = sycl::malloc_host<fp>(sizec, main_queue);
init_matrix(A_host, transA, m, k, ldA);
init_matrix(B_host, transB, k, n, ldB);
init_matrix(C_host, oneapi::mkl::transpose::nontrans, m, n, ldC);
  // Copy A/B/C from host to the device or subdevices.
  // When two stacks are detected, the GEMM operation is split between the
  // stacks in the n direction: the entire A matrix is copied to both stacks,
  // while the B and C matrices are split so that only half of each is copied
  // to each stack.
  fp *A_dev0 = nullptr, *A_dev1 = nullptr, *B_dev0 = nullptr, *B_dev1 = nullptr,
     *C_dev0 = nullptr, *C_dev1 = nullptr, *A_dev = nullptr, *B_dev = nullptr,
     *C_dev = nullptr;
if (nb_device > 1) {
A_dev0 = sycl::malloc_device<fp>(sizea, subdev_queue0);
A_dev1 = sycl::malloc_device<fp>(sizea, subdev_queue1);
B_dev0 = sycl::malloc_device<fp>(sizeb / 2, subdev_queue0);
B_dev1 = sycl::malloc_device<fp>(sizeb / 2, subdev_queue1);
C_dev0 = sycl::malloc_device<fp>(sizec / 2, subdev_queue0);
C_dev1 = sycl::malloc_device<fp>(sizec / 2, subdev_queue1);
// Entire A matrix is copied to both stacks
subdev_queue0.memcpy(A_dev0, A_host, sizea * sizeof(fp));
subdev_queue1.memcpy(A_dev1, A_host, sizea * sizeof(fp));
// Half of B and C are copied to each stack
subdev_queue0.memcpy(B_dev0, B_host, (sizeb / 2) * sizeof(fp));
subdev_queue1.memcpy(B_dev1, B_host + ldB * n / 2,
(sizeb / 2) * sizeof(fp));
subdev_queue0.memcpy(C_dev0, C_host, (sizec / 2) * sizeof(fp));
subdev_queue1.memcpy(C_dev1, C_host + ldC * n / 2,
(sizec / 2) * sizeof(fp));
subdev_queue0.wait();
subdev_queue1.wait();
} else {
A_dev = sycl::malloc_device<fp>(sizea, main_queue);
B_dev = sycl::malloc_device<fp>(sizeb, main_queue);
C_dev = sycl::malloc_device<fp>(sizec, main_queue);
main_queue.memcpy(A_dev, A_host, sizea * sizeof(fp));
main_queue.memcpy(B_dev, B_host, sizeb * sizeof(fp));
main_queue.memcpy(C_dev, C_host, sizec * sizeof(fp));
main_queue.wait();
}
// Call oneMKL GEMM API
// When two stacks are detected, GEMM call is launched on each stack
sycl::event gemm_done0;
sycl::event gemm_done1;
std::vector<sycl::event> gemm_dependencies;
try {
if (nb_device > 1) {
// Split B and C for each stack
int64_t n_subdev = n / 2;
gemm_done0 = oneapi::mkl::blas::gemm(
subdev_queue0, transA, transB, m, n_subdev, k, alpha, A_dev0, ldA,
B_dev0, ldB, beta, C_dev0, ldC, gemm_dependencies);
gemm_done1 = oneapi::mkl::blas::gemm(
subdev_queue1, transA, transB, m, n_subdev, k, alpha, A_dev1, ldA,
B_dev1, ldB, beta, C_dev1, ldC, gemm_dependencies);
} else {
gemm_done0 = oneapi::mkl::blas::gemm(main_queue, transA, transB, m, n, k,
alpha, A_dev, ldA, B_dev, ldB, beta,
C_dev, ldC, gemm_dependencies);
}
} catch (sycl::exception const &e) {
std::cout << "\t\tCaught synchronous SYCL exception during GEMM:\n"
<< e.what() << std::endl;
}
// Wait for GEMM calls to finish
gemm_done0.wait();
if (nb_device > 1)
gemm_done1.wait();
// Copy C from device/subdevices to host
if (nb_device > 1) {
subdev_queue0.memcpy(C_host, C_dev0, (sizec / 2) * sizeof(fp));
subdev_queue1.memcpy(C_host + ldC * n / 2, C_dev1,
(sizec / 2) * sizeof(fp));
subdev_queue0.wait();
subdev_queue1.wait();
} else {
main_queue.memcpy(C_host, C_dev, sizec * sizeof(fp));
main_queue.wait();
}
// Clean-up
  sycl::free(A_host, cxt);
  sycl::free(B_host, cxt);
  sycl::free(C_host, cxt);
if (nb_device > 1) {
sycl::free(A_dev0, subdev_queue0);
sycl::free(A_dev1, subdev_queue1);
sycl::free(B_dev0, subdev_queue0);
sycl::free(B_dev1, subdev_queue1);
sycl::free(C_dev0, subdev_queue0);
sycl::free(C_dev1, subdev_queue1);
} else {
sycl::free(A_dev, main_queue);
sycl::free(B_dev, main_queue);
sycl::free(C_dev, main_queue);
}
if (nb_device > 1)
std::cout << "\tGEMM operation is complete on 2 stacks" << std::endl;
else
std::cout << "\tGEMM operation is complete on 1 stack" << std::endl;
}
// Main entry point for example.
// GEMM example is run on GPU for 4 data types.
int main(int argc, char **argv) {
// Create GPU device
sycl::device dev = sycl::device(sycl::gpu_selector_v);
std::cout << "Running with single precision real data type:" << std::endl;
run_gemm<float>(dev);
std::cout << "Running with double precision real data type:" << std::endl;
run_gemm<double>(dev);
std::cout << "Running with single precision complex data type:" << std::endl;
run_gemm<std::complex<float>>(dev);
std::cout << "Running with double precision complex data type:" << std::endl;
run_gemm<std::complex<double>>(dev);
return 0;
}
In this example, the scalar and dimension parameters for GEMM are initialized first.
// Initialize data for GEMM
oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;
// Matrix data sizes
int64_t m = 1024;
int64_t n = 1024;
int64_t k = 1024;
// Leading dimensions of matrices
int64_t ldA = 1024;
int64_t ldB = 1024;
int64_t ldC = 1024;
// Set scalar fp values
fp alpha = set_fp_value(fp(2.0), fp(-0.5));
fp beta = set_fp_value(fp(3.0), fp(-1.5));
Next, if available, subdevices are created from the root device and stored in a std::vector. If the platform has no subdevices (for example, a single-stack device), the root device is used for the computation. Note that the example also demonstrates how to handle asynchronous exceptions.
// Create subdevices for a multi-stack device
std::vector<sycl::device> subdev = {};
sycl::queue subdev_queue0;
sycl::queue subdev_queue1;
try {
using namespace sycl::info;
subdev = dev.create_sub_devices<
partition_property::partition_by_affinity_domain>(
partition_affinity_domain::numa);
std::cout << "\tUse " << subdev.size() << " subdevices" << std::endl;
} catch (sycl::exception &e) {
std::cout << "\tSubdevices not found, use root device" << std::endl;
}
int64_t nb_device = subdev.size();
// Catch asynchronous exceptions
auto exception_handler = [](sycl::exception_list exceptions) {
for (std::exception_ptr const &e : exceptions) {
try {
std::rethrow_exception(e);
} catch (sycl::exception const &e) {
std::cout << "Caught asynchronous SYCL exception during GEMM:\n"
<< e.what() << std::endl;
}
}
};
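As an alternative to relying on the exception thrown by create_sub_devices, partition support can be queried up front through the SYCL device info API. The following sketch is not part of the example above and additionally requires <algorithm>; it only illustrates this alternative.
// Sketch: query partition support before attempting to create subdevices.
auto props = dev.get_info<sycl::info::device::partition_properties>();
bool can_partition =
    std::find(props.begin(), props.end(),
              sycl::info::partition_property::partition_by_affinity_domain) !=
    props.end();
std::vector<sycl::device> subdevices;
if (can_partition) {
  // The supported affinity domains can also be checked via
  // sycl::info::device::partition_affinity_domains.
  subdevices = dev.create_sub_devices<
      sycl::info::partition_property::partition_by_affinity_domain>(
      sycl::info::partition_affinity_domain::numa);
}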
The context and queues are then created. If subdevices were detected earlier, a separate queue is created for each subdevice.
// Create context and execution queue
subdev.push_back(dev);
sycl::context cxt(subdev);
sycl::queue main_queue(cxt, dev, exception_handler);
// If two subdevices are detected, create queue for each subdevice
if (nb_device > 1) {
subdev_queue0 = sycl::queue(cxt, subdev[0], exception_handler);
subdev_queue1 = sycl::queue(cxt, subdev[1], exception_handler);
}
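The queues above are out-of-order queues, so the example enforces ordering with explicit event and queue waits. A possible alternative, sketched below under the assumption that the same context, subdevices, and exception handler are used, is to create in-order queues with the sycl::property::queue::in_order property so that transfers and GEMM calls submitted to the same queue execute in submission order.
// Sketch: in-order queues serialize work submitted to the same queue, which
// removes the need for explicit waits between the memcpy and GEMM submissions
// on that queue. A final wait is still required before the host reads results.
sycl::property_list in_order_props{sycl::property::queue::in_order{}};
sycl::queue in_order_queue0(cxt, subdev[0], exception_handler, in_order_props);
sycl::queue in_order_queue1(cxt, subdev[1], exception_handler, in_order_props);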
Before the oneMKL calls are invoked, the input and output matrices must be allocated, initialized, and transferred from host to device. Note that when two subdevices are detected, the first half of the GEMM computation is performed on the first stack using the entire A matrix and the first half of the B and C matrices, while the second half is performed on the second stack using the entire A matrix and the second halves of B and C. Because each column of C depends only on all of A and the corresponding column of B, the work can be split along the n direction with no data exchange between the stacks; the matrices therefore need to be copied to each stack accordingly.
// Allocate and initialize arrays for matrices
int64_t sizea;
if (transA == oneapi::mkl::transpose::nontrans)
sizea = ldA * k;
else
sizea = ldA * m;
int64_t sizeb = ldB * n;
int64_t sizec = ldC * n;
auto A_host = sycl::malloc_host<fp>(sizea, main_queue);
auto B_host = sycl::malloc_host<fp>(sizeb, main_queue);
auto C_host = sycl::malloc_host<fp>(sizec, main_queue);
init_matrix(A_host, transA, m, k, ldA);
init_matrix(B_host, transB, k, n, ldB);
init_matrix(C_host, oneapi::mkl::transpose::nontrans, m, n, ldC);
// Copy A/B/C from host to the device or subdevices.
// When two stacks are detected, the GEMM operation is split between the
// stacks in the n direction: the entire A matrix is copied to both stacks,
// while the B and C matrices are split so that only half of each is copied
// to each stack.
fp *A_dev0 = nullptr, *A_dev1 = nullptr, *B_dev0 = nullptr, *B_dev1 = nullptr,
   *C_dev0 = nullptr, *C_dev1 = nullptr, *A_dev = nullptr, *B_dev = nullptr,
   *C_dev = nullptr;
if (nb_device > 1) {
A_dev0 = sycl::malloc_device<fp>(sizea, subdev_queue0);
A_dev1 = sycl::malloc_device<fp>(sizea, subdev_queue1);
B_dev0 = sycl::malloc_device<fp>(sizeb / 2, subdev_queue0);
B_dev1 = sycl::malloc_device<fp>(sizeb / 2, subdev_queue1);
C_dev0 = sycl::malloc_device<fp>(sizec / 2, subdev_queue0);
C_dev1 = sycl::malloc_device<fp>(sizec / 2, subdev_queue1);
// Entire A matrix is copied to both stacks
subdev_queue0.memcpy(A_dev0, A_host, sizea * sizeof(fp));
subdev_queue1.memcpy(A_dev1, A_host, sizea * sizeof(fp));
// Half of B and C are copied to each stack
subdev_queue0.memcpy(B_dev0, B_host, (sizeb / 2) * sizeof(fp));
subdev_queue1.memcpy(B_dev1, B_host + ldB * n / 2,
(sizeb / 2) * sizeof(fp));
subdev_queue0.memcpy(C_dev0, C_host, (sizec / 2) * sizeof(fp));
subdev_queue1.memcpy(C_dev1, C_host + ldC * n / 2,
(sizec / 2) * sizeof(fp));
subdev_queue0.wait();
subdev_queue1.wait();
} else {
A_dev = sycl::malloc_device<fp>(sizea, main_queue);
B_dev = sycl::malloc_device<fp>(sizeb, main_queue);
C_dev = sycl::malloc_device<fp>(sizec, main_queue);
main_queue.memcpy(A_dev, A_host, sizea * sizeof(fp));
main_queue.memcpy(B_dev, B_host, sizeb * sizeof(fp));
main_queue.memcpy(C_dev, C_host, sizec * sizeof(fp));
main_queue.wait();
}
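For reference, the same n-direction split can be written for an arbitrary number of subdevices. The sketch below is not part of the example; it assumes column-major storage, transB equal to nontrans (as above), and a hypothetical std::vector<sycl::queue> named queues holding one queue per subdevice.
// Sketch: split B and C by columns across nb_device queues.
std::vector<fp *> A_parts(nb_device), B_parts(nb_device), C_parts(nb_device);
int64_t col_offset = 0;
for (int64_t d = 0; d < nb_device; d++) {
  // Columns handled by queue d; any remainder of n is distributed.
  int64_t n_d = n / nb_device + (d < n % nb_device ? 1 : 0);
  A_parts[d] = sycl::malloc_device<fp>(sizea, queues[d]);
  B_parts[d] = sycl::malloc_device<fp>(ldB * n_d, queues[d]);
  C_parts[d] = sycl::malloc_device<fp>(ldC * n_d, queues[d]);
  // The entire A matrix goes to every subdevice.
  queues[d].memcpy(A_parts[d], A_host, sizea * sizeof(fp));
  // Each subdevice receives its own block of columns of B and C.
  queues[d].memcpy(B_parts[d], B_host + ldB * col_offset, ldB * n_d * sizeof(fp));
  queues[d].memcpy(C_parts[d], C_host + ldC * col_offset, ldC * n_d * sizeof(fp));
  col_offset += n_d;
}
for (int64_t d = 0; d < nb_device; d++)
  queues[d].wait();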
Finally, the oneMKL GEMM APIs are called with the corresponding device or subdevice queues, and the updated C matrix is transferred back to the host once the GEMM calls complete.
// Call oneMKL GEMM API
// When two stacks are detected, GEMM call is launched on each stack
sycl::event gemm_done0;
sycl::event gemm_done1;
std::vector<sycl::event> gemm_dependencies;
try {
if (nb_device > 1) {
// Split B and C for each stack
int64_t n_subdev = n / 2;
gemm_done0 = oneapi::mkl::blas::gemm(
subdev_queue0, transA, transB, m, n_subdev, k, alpha, A_dev0, ldA,
B_dev0, ldB, beta, C_dev0, ldC, gemm_dependencies);
gemm_done1 = oneapi::mkl::blas::gemm(
subdev_queue1, transA, transB, m, n_subdev, k, alpha, A_dev1, ldA,
B_dev1, ldB, beta, C_dev1, ldC, gemm_dependencies);
} else {
gemm_done0 = oneapi::mkl::blas::gemm(main_queue, transA, transB, m, n, k,
alpha, A_dev, ldA, B_dev, ldB, beta,
C_dev, ldC, gemm_dependencies);
}
} catch (sycl::exception const &e) {
std::cout << "\t\tCaught synchronous SYCL exception during GEMM:\n"
<< e.what() << std::endl;
}
// Wait for GEMM calls to finish
gemm_done0.wait();
if (nb_device > 1)
gemm_done1.wait();
// Copy C from device/subdevices to host
if (nb_device > 1) {
subdev_queue0.memcpy(C_host, C_dev0, (sizec / 2) * sizeof(fp));
subdev_queue1.memcpy(C_host + ldC * n / 2, C_dev1,
(sizec / 2) * sizeof(fp));
subdev_queue0.wait();
subdev_queue1.wait();
} else {
main_queue.memcpy(C_host, C_dev, sizec * sizeof(fp));
main_queue.wait();
}
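To sanity-check the split computation, a few entries of C_host can be compared against a naive host-side reference. The sketch below assumes the nontrans/nontrans case and the column-major layout used throughout the example, plus a hypothetical buffer C_ref_host holding a copy of the original C taken before the host-to-device transfer.
// Sketch: spot-check a subset of entries of C_host against a naive reference,
// expected = alpha * A * B + beta * C_original (nontrans, column-major).
for (int64_t j = 0; j < n; j += n / 4) {
  for (int64_t i = 0; i < m; i += m / 4) {
    fp acc = fp(0.0);
    for (int64_t p = 0; p < k; p++)
      acc += A_host[i + p * ldA] * B_host[p + j * ldB];
    fp expected = alpha * acc + beta * C_ref_host[i + j * ldC];
    // Compare expected against C_host[i + j * ldC] with a tolerance that
    // accounts for the data type and the accumulation length k.
  }
}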
To build and run this example, use the commands below after setting the MKLROOT environment variable.
icpx -fsycl -qmkl=sequential -DMKL_ILP64 -L${MKLROOT}/lib/intel64 -lsycl -lOpenCL -lpthread -lm -ldl -o onemkl_dpcpp_gemm onemkl_dpcpp_gemm.cpp
export LD_LIBRARY_PATH=${MKLROOT}/lib/intel64/:${LD_LIBRARY_PATH}
./onemkl_dpcpp_gemm
For large problem sizes (for example, m = n = k = 8192), close to 2x performance scaling is expected when using explicit scaling across two stacks.
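One way to verify the scaling on a particular system is to time the two GEMM submissions and their completion around the existing code, for example with std::chrono (requires <chrono>); a warm-up iteration should be run first so that one-time initialization costs are excluded. The following is a minimal sketch that reuses the variables from the two-stack branch of the example.
// Sketch: measure the elapsed time of both GEMM calls running concurrently.
auto t0 = std::chrono::steady_clock::now();
gemm_done0 = oneapi::mkl::blas::gemm(subdev_queue0, transA, transB, m, n / 2, k,
                                     alpha, A_dev0, ldA, B_dev0, ldB, beta,
                                     C_dev0, ldC, gemm_dependencies);
gemm_done1 = oneapi::mkl::blas::gemm(subdev_queue1, transA, transB, m, n / 2, k,
                                     alpha, A_dev1, ldA, B_dev1, ldB, beta,
                                     C_dev1, ldC, gemm_dependencies);
gemm_done0.wait();
gemm_done1.wait();
auto t1 = std::chrono::steady_clock::now();
double seconds = std::chrono::duration<double>(t1 - t0).count();
// A real GEMM performs about 2 * m * n * k floating-point operations.
std::cout << "GEMM GFLOP/s: " << 2.0 * m * n * k / seconds / 1.0e9 << std::endl;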