Visible to Intel only — GUID: GUID-A6FB8A50-F3DA-4A58-9166-ADB2E40EDECE
Explicit Scaling - OpenMP
In this section we describe explicit scaling in OpenMP in COMPOSITE mode and provide usage examples.
Remember to set the environment variable ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE to enable COMPOSITE mode.
Unified Shared Memory (USM)
Three OpenMP APIs as Intel extensions for USM memory allocations have been added: omp_target_alloc_host, omp_target_alloc_device, and omp_target_alloc_shared.
Please refer to OpenMP USM Allocation API section for details.
Offloading to multiple subdevices
In this scheme, we have multiple subdevices on which the code will run, and queues are attached to the subdevices. This effectively results in “explicit scaling”. See code example below:
#define DEVKIND 0 // Stack
int root_id = omp_get_default_device();
#pragma omp parallel for
for (int id = 0; id < NUM_SUBDEVICES; ++id) {
  // One offload region per subdevice (stack); the subdevice clause is an
  // Intel extension selecting stack `id` of the root device.
  #pragma omp target teams distribute parallel for device(root_id) \
          subdevice(DEVKIND, id) map(...)
  for (int i = lb(id); i < ub(id); i++) {
    ...
  }
}
In COMPOSITE mode, each subdevice (stack) can be further decomposed to subsubdevices (Compute Command Streamers or CCSs). For more information about subsubdevices, refer to Advanced Topics section.
Offloading to a single root device
In this scheme, we have a single root device and a queue attached to the root device. The work is automatically distributed across all subdevices/stacks via “implicit scaling” by the GPU driver. This is the simplest way to enable multi-stack utilization, without targeting specific stacks. See code example below:
int root_id = omp_get_default_device();
// A single offload region on the root device; the driver implicitly
// distributes the iterations across all stacks.
#pragma omp target teams distribute parallel for device(root_id) map(...)
for (int i = 0; i < N; i++) {
  ...
}
Offloading to multiple root devices
In this scheme, we have multiple root devices, where each root device is a GPU card. The queues are attached to the root devices, which offers more sharing possibilities but at the cost of slow access through host memory or explicit copying of data. See code example:
int num_devices = omp_get_num_devices();
// One host thread per root device (GPU card), each driving its own
// offload region over its slice [lb(root_id), ub(root_id)).
#pragma omp parallel for
for (int root_id = 0; root_id < num_devices; root_id++) {
  #pragma omp target teams distribute parallel for device(root_id) map(...)
  for (int i = lb(root_id); i < ub(root_id); i++) {
    ...
  }
}
Program: Offloading to subdevices (stacks) in COMPOSITE mode
Depending on the chosen devices or subdevices used, as well as the algorithm used, be sure to do proper memory allocation/synchronization. The following is a full OpenMP program that offloads to multiple subdevices (stacks) in COMPOSITE mode.
#include <assert.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h> // malloc / free

#include <iostream>
#ifndef NUM_SUBDEVICES
#define NUM_SUBDEVICES 1
#endif
#ifndef DEVKIND
#define DEVKIND 0 // Stack
#endif
// Bundle of per-subdevice host pointers: p[s] is the host pointer backing
// the array copy mapped onto subdevice (stack) s.
template <int num_subdevices> struct mptr {
  float *p[num_subdevices];
};
// Explicit scaling across subdevices (stacks) in COMPOSITE mode:
// each stack gets its own device allocation of a/b/c and its own offloaded
// kernel computing c = a + b; host threads drive the stacks concurrently.
int main(int argc, char **argv) {
  constexpr int SIZE = 8e6;
  constexpr int SIMD_SIZE = 32;
  constexpr std::size_t TOTAL_SIZE = SIZE * SIMD_SIZE;
  constexpr int num_subdevices = NUM_SUBDEVICES;

  // Per-stack host pointers for the three arrays.
  mptr<num_subdevices> device_ptr_a;
  mptr<num_subdevices> device_ptr_b;
  mptr<num_subdevices> device_ptr_c;

  const int default_device = omp_get_default_device();
  std::cout << "default_device = " << default_device << std::endl;

  // Allocate host storage and create a matching (uninitialized) device
  // allocation on each stack via "target enter data".
  for (int sdev = 0; sdev < num_subdevices; ++sdev) {
    device_ptr_a.p[sdev] =
        static_cast<float *>(malloc(TOTAL_SIZE * sizeof(float)));
    device_ptr_b.p[sdev] =
        static_cast<float *>(malloc(TOTAL_SIZE * sizeof(float)));
    device_ptr_c.p[sdev] =
        static_cast<float *>(malloc(TOTAL_SIZE * sizeof(float)));
#pragma omp target enter data map(alloc                                        \
                                  : device_ptr_a.p[sdev] [0:TOTAL_SIZE])      \
    device(default_device) subdevice(DEVKIND, sdev)
#pragma omp target enter data map(alloc                                        \
                                  : device_ptr_b.p[sdev] [0:TOTAL_SIZE])      \
    device(default_device) subdevice(DEVKIND, sdev)
#pragma omp target enter data map(alloc                                        \
                                  : device_ptr_c.p[sdev] [0:TOTAL_SIZE])      \
    device(default_device) subdevice(DEVKIND, sdev)
  } // for (int sdev ...

  std::cout << "memory footprint per GPU = "
            << 3 * (std::size_t)(TOTAL_SIZE) * sizeof(float) * 1E-9 << " GB"
            << std::endl;

  // Initialize a and b directly on each stack (one host thread per stack).
#pragma omp parallel for
  for (int sdev = 0; sdev < num_subdevices; ++sdev) {
    float *a = device_ptr_a.p[sdev];
    float *b = device_ptr_b.p[sdev];
#pragma omp target teams distribute parallel for device(default_device)        \
    subdevice(DEVKIND, sdev)
    for (std::size_t i = 0; i < TOTAL_SIZE; ++i) {
      a[i] = i + 0.5;
      b[i] = i - 0.5;
    }
  }

  // Timed region: no_max_rep repetitions of c = a + b on every stack.
  // Iteration 0 is a warm-up and is excluded from the measurement.
  const int no_max_rep = 200;
  double time = 0.0;
  for (int irep = 0; irep < no_max_rep + 1; ++irep) {
    if (irep == 1)
      time = omp_get_wtime();
#pragma omp parallel for num_threads(num_subdevices)
    for (int sdev = 0; sdev < num_subdevices; ++sdev) {
      float *a = device_ptr_a.p[sdev];
      float *b = device_ptr_b.p[sdev];
      float *c = device_ptr_c.p[sdev];
#pragma omp target teams distribute parallel for device(default_device)        \
    subdevice(DEVKIND, sdev)
      for (std::size_t i = 0; i < TOTAL_SIZE; ++i) {
        c[i] = a[i] + b[i];
      }
    }
  }
  time = omp_get_wtime() - time;
  time = time / no_max_rep;

  const std::size_t streamed_bytes =
      3 * (std::size_t)(TOTAL_SIZE)*num_subdevices * sizeof(float);
  std::cout << "bandwidth = " << (streamed_bytes / time) * 1E-9 << " GB/s"
            << std::endl;
  std::cout << "time = " << time << " s" << std::endl;

  std::cout.precision(10);
  for (int sdev = 0; sdev < num_subdevices; ++sdev) {
    // Copy all three arrays back to the host: a and b were initialized on
    // the device only and are needed for the verification below.
#pragma omp target update from(device_ptr_a.p[sdev][:TOTAL_SIZE])              \
    device(default_device) subdevice(DEVKIND, sdev)
#pragma omp target update from(device_ptr_b.p[sdev][:TOTAL_SIZE])              \
    device(default_device) subdevice(DEVKIND, sdev)
#pragma omp target update from(device_ptr_c.p[sdev][:TOTAL_SIZE])              \
    device(default_device) subdevice(DEVKIND, sdev)
    std::cout << "-GPU: device id = : " << sdev << std::endl;
    std::cout << "target result:" << std::endl;
    std::cout << "c[" << 0 << "] = " << device_ptr_c.p[sdev][0] << std::endl;
    std::cout << "c[" << SIMD_SIZE - 1
              << "] = " << device_ptr_c.p[sdev][SIMD_SIZE - 1] << std::endl;
    std::cout << "c[" << TOTAL_SIZE / 2
              << "] = " << device_ptr_c.p[sdev][TOTAL_SIZE / 2] << std::endl;
    std::cout << "c[" << TOTAL_SIZE - 1
              << "] = " << device_ptr_c.p[sdev][TOTAL_SIZE - 1] << std::endl;
  }

  // Verify the kernel's actual computation, c == a + b. The host-side sum
  // repeats the same single-precision addition the device performed, so the
  // truncated-int comparison is expected to match exactly.
  for (int sdev = 0; sdev < num_subdevices; ++sdev) {
    for (std::size_t i = 0; i < TOTAL_SIZE; ++i) {
      assert((int)(device_ptr_c.p[sdev][i]) ==
             (int)(device_ptr_a.p[sdev][i] + device_ptr_b.p[sdev][i]));
    }
  }

  // Release the device allocations for all three arrays (a, b, AND c),
  // then free the host buffers.
  for (int sdev = 0; sdev < num_subdevices; ++sdev) {
#pragma omp target exit data map(release                                       \
                                 : device_ptr_a.p[sdev][:TOTAL_SIZE])         \
    device(default_device) subdevice(DEVKIND, sdev)
#pragma omp target exit data map(release                                       \
                                 : device_ptr_b.p[sdev][:TOTAL_SIZE])         \
    device(default_device) subdevice(DEVKIND, sdev)
#pragma omp target exit data map(release                                       \
                                 : device_ptr_c.p[sdev][:TOTAL_SIZE])         \
    device(default_device) subdevice(DEVKIND, sdev)
    free(device_ptr_a.p[sdev]);
    free(device_ptr_b.p[sdev]);
    free(device_ptr_c.p[sdev]);
  }
  return 0;
}
Compilation command:
$ icpx -ffp-contract=fast -O2 -ffast-math -DNUM_SUBDEVICES=2 \
-fiopenmp -fopenmp-targets=spir64 openmp_explicit_subdevice.cpp
Run command:
$ ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE OMP_TARGET_OFFLOAD=MANDATORY ./a.out
This OpenMP program achieves linear scaling ~2x on an Intel® Data Center GPU Max system.