Avoiding Moving Data Back and Forth between Host and Device
The cost of moving data between host and device is quite high, especially in the case of discrete accelerators, so it is very important to avoid such transfers wherever possible. Sometimes the data computed by a kernel on the accelerator must be brought to the host, operated on, and then sent back to the device for further processing. In that situation we pay for a device-to-host transfer and then another host-to-device transfer.
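To make the pattern concrete, the following sketch shows the round trip in its simplest form, using explicit USM transfers so the two copies are visible in the code. This is a hypothetical illustration, not part of the examples that follow; it assumes q is an in-order queue and that a, b, and c point to USM device allocations of length n.

#include <sycl/sycl.hpp>
#include <vector>

void roundTrip(sycl::queue &q, int *a, int *b, int *c, size_t n) {
  int *tmp = sycl::malloc_device<int>(n, q);
  std::vector<int> host_tmp(n);

  // kernel1: produce the intermediate result on the device
  q.parallel_for(sycl::range<1>{n},
                 [=](sycl::id<1> i) { tmp[i] = a[i] + b[i]; });

  // Device-to-host copy, host-side clipping, host-to-device copy:
  // these are the two transfers that the rest of this section eliminates
  q.memcpy(host_tmp.data(), tmp, n * sizeof(int)).wait();
  for (size_t i = 0; i < n; i++)
    host_tmp[i] = (host_tmp[i] > 10) ? 1 : 0;
  q.memcpy(tmp, host_tmp.data(), n * sizeof(int)).wait();

  // kernel2: consume the intermediate result on the device
  q.parallel_for(sycl::range<1>{n},
                 [=](sycl::id<1> i) { c[i] = tmp[i] * 2; }).wait();
  sycl::free(tmp, q);
}

The buffer-based examples below express the same round trip implicitly: constructing a host accessor triggers the device-to-host copy, and the next kernel's accessor triggers the copy back.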
Consider the following example, where one kernel produces data through some operation (in this case, vector add) into a new vector. This new vector is then transformed into a third vector by applying a function to each value, and the third vector is finally fed as input into another kernel for additional computation. This form of computation is quite common and occurs in many domains where algorithms are iterative and the output of one computation is fed as input into another. In machine learning, for example, models are structured as layers of computations, and the output of one layer is the input to the next.
double myFunc1(sycl::queue &q, AlignedVector<int> &a, AlignedVector<int> &b,
               AlignedVector<int> &c, AlignedVector<int> &d,
               AlignedVector<int> &res, int iter) {
  sycl::range num_items{a.size()};
  VectorAllocator<int> alloc;
  AlignedVector<int> sum(a.size(), alloc);

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};
  sycl::buffer a_buf(a, props);
  sycl::buffer b_buf(b, props);
  sycl::buffer c_buf(c, props);
  sycl::buffer d_buf(d, props);
  sycl::buffer res_buf(res, props);
  sycl::buffer sum_buf(sum.data(), num_items, props);

  Timer timer;
  for (int i = 0; i < iter; i++) {
    // kernel1: produce the intermediate vector on the device
    q.submit([&](auto &h) {
      // Input accessors
      sycl::accessor a_acc(a_buf, h, sycl::read_only);
      sycl::accessor b_acc(b_buf, h, sycl::read_only);
      // Output accessor
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);

      h.parallel_for(num_items,
                     [=](auto id) { sum_acc[id] = a_acc[id] + b_acc[id]; });
    });
    // Constructing the host accessor blocks until kernel1 finishes and
    // copies sum_buf to the host, where the clipping is applied
    {
      sycl::host_accessor h_acc(sum_buf);
      for (int j = 0; j < a.size(); j++)
        if (h_acc[j] > 10)
          h_acc[j] = 1;
        else
          h_acc[j] = 0;
    }
    // kernel2: reads sum_buf, which must first be copied back to the device
    q.submit([&](auto &h) {
      // Input accessors
      sycl::accessor sum_acc(sum_buf, h, sycl::read_only);
      sycl::accessor c_acc(c_buf, h, sycl::read_only);
      sycl::accessor d_acc(d_buf, h, sycl::read_only);
      // Output accessor
      sycl::accessor res_acc(res_buf, h, sycl::write_only, sycl::no_init);

      h.parallel_for(num_items, [=](auto id) {
        res_acc[id] = sum_acc[id] * c_acc[id] + d_acc[id];
      });
    });
    q.wait();
  }
  double elapsed = timer.Elapsed() / iter;
  return (elapsed);
} // end myFunc1
Instead of bringing the data to the host, applying the function, and sending it back to the device, you can create a kernel3 to execute the function on the device, as shown in the following example. kernel3 operates on the intermediate data in sum_buf between kernel1 and kernel2, avoiding the round trip of data between the device and the host.
double myFunc2(sycl::queue &q, AlignedVector<int> &a, AlignedVector<int> &b,
               AlignedVector<int> &c, AlignedVector<int> &d,
               AlignedVector<int> &res, int iter) {
  sycl::range num_items{a.size()};
  VectorAllocator<int> alloc;
  AlignedVector<int> sum(a.size(), alloc);

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};
  sycl::buffer a_buf(a, props);
  sycl::buffer b_buf(b, props);
  sycl::buffer c_buf(c, props);
  sycl::buffer d_buf(d, props);
  sycl::buffer res_buf(res, props);
  sycl::buffer sum_buf(sum.data(), num_items, props);

  Timer timer;
  for (int i = 0; i < iter; i++) {
    // kernel1
    q.submit([&](auto &h) {
      // Input accessors
      sycl::accessor a_acc(a_buf, h, sycl::read_only);
      sycl::accessor b_acc(b_buf, h, sycl::read_only);
      // Output accessor
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);

      h.parallel_for(num_items,
                     [=](auto i) { sum_acc[i] = a_acc[i] + b_acc[i]; });
    });
    // kernel3: applies the clipping on the device, so sum_buf never
    // leaves device memory
    q.submit([&](auto &h) {
      sycl::accessor sum_acc(sum_buf, h, sycl::read_write);
      h.parallel_for(num_items, [=](auto id) {
        if (sum_acc[id] > 10)
          sum_acc[id] = 1;
        else
          sum_acc[id] = 0;
      });
    });
    // kernel2
    q.submit([&](auto &h) {
      // Input accessors
      sycl::accessor sum_acc(sum_buf, h, sycl::read_only);
      sycl::accessor c_acc(c_buf, h, sycl::read_only);
      sycl::accessor d_acc(d_buf, h, sycl::read_only);
      // Output accessor
      sycl::accessor res_acc(res_buf, h, sycl::write_only, sycl::no_init);

      h.parallel_for(num_items, [=](auto i) {
        res_acc[i] = sum_acc[i] * c_acc[i] + d_acc[i];
      });
    });
    q.wait();
  }
  double elapsed = timer.Elapsed() / iter;
  return (elapsed);
} // end myFunc2
There are other ways to optimize this example. For instance, the clipping operation in kernel3 can be merged into the computation of kernel1, as shown below. This is kernel fusion, and it has the added advantage of avoiding the launch of a third kernel. The SYCL compiler cannot perform this kind of optimization on its own. In some domains, such as machine learning, graph compilers operate on the ML models and fuse operations, which has the same effect.
double myFunc3(sycl::queue &q, AlignedVector<int> &a, AlignedVector<int> &b,
               AlignedVector<int> &c, AlignedVector<int> &d,
               AlignedVector<int> &res, int iter) {
  sycl::range num_items{a.size()};
  VectorAllocator<int> alloc;
  AlignedVector<int> sum(a.size(), alloc);

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};
  sycl::buffer a_buf(a, props);
  sycl::buffer b_buf(b, props);
  sycl::buffer c_buf(c, props);
  sycl::buffer d_buf(d, props);
  sycl::buffer res_buf(res, props);
  sycl::buffer sum_buf(sum.data(), num_items, props);

  Timer timer;
  for (int i = 0; i < iter; i++) {
    // kernel1: the clipping is fused into the vector add
    q.submit([&](auto &h) {
      // Input accessors
      sycl::accessor a_acc(a_buf, h, sycl::read_only);
      sycl::accessor b_acc(b_buf, h, sycl::read_only);
      // Output accessor
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);

      h.parallel_for(num_items, [=](auto i) {
        int t = a_acc[i] + b_acc[i];
        if (t > 10)
          sum_acc[i] = 1;
        else
          sum_acc[i] = 0;
      });
    });
    // kernel2
    q.submit([&](auto &h) {
      // Input accessors
      sycl::accessor sum_acc(sum_buf, h, sycl::read_only);
      sycl::accessor c_acc(c_buf, h, sycl::read_only);
      sycl::accessor d_acc(d_buf, h, sycl::read_only);
      // Output accessor
      sycl::accessor res_acc(res_buf, h, sycl::write_only, sycl::no_init);

      h.parallel_for(num_items, [=](auto i) {
        res_acc[i] = sum_acc[i] * c_acc[i] + d_acc[i];
      });
    });
    q.wait();
  }
  double elapsed = timer.Elapsed() / iter;
  return (elapsed);
} // end myFunc3
We can take this kernel fusion one step further and fuse kernel1 and kernel2, as shown in the code below. This gives very good performance because it avoids the intermediate sum_buf completely, saving memory in addition to avoiding the launch of an additional kernel. Most of the performance benefit in this case comes from the improved locality of memory references.
double myFunc4(sycl::queue &q, AlignedVector<int> &a, AlignedVector<int> &b,
               AlignedVector<int> &c, AlignedVector<int> &d,
               AlignedVector<int> &res, int iter) {
  sycl::range num_items{a.size()};

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};
  sycl::buffer a_buf(a, props);
  sycl::buffer b_buf(b, props);
  sycl::buffer c_buf(c, props);
  sycl::buffer d_buf(d, props);
  sycl::buffer res_buf(res, props);

  Timer timer;
  for (int i = 0; i < iter; i++) {
    // kernel1: all three operations fused; no intermediate buffer needed
    q.submit([&](auto &h) {
      // Input accessors
      sycl::accessor a_acc(a_buf, h, sycl::read_only);
      sycl::accessor b_acc(b_buf, h, sycl::read_only);
      sycl::accessor c_acc(c_buf, h, sycl::read_only);
      sycl::accessor d_acc(d_buf, h, sycl::read_only);
      // Output accessor
      sycl::accessor res_acc(res_buf, h, sycl::write_only, sycl::no_init);

      h.parallel_for(num_items, [=](auto i) {
        int t = a_acc[i] + b_acc[i];
        if (t > 10)
          res_acc[i] = c_acc[i] + d_acc[i];
        else
          res_acc[i] = d_acc[i];
      });
    });
    q.wait();
  }
  double elapsed = timer.Elapsed() / iter;
  return (elapsed);
} // end myFunc4
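The functions above rely on a few helper types (AlignedVector, VectorAllocator, Timer) that are defined elsewhere in the sample code. The minimal definitions below are one plausible sketch, sufficient to compile and time the four variants; they are not the definitions used by the original sample.

// Hypothetical minimal definitions of the helpers used above. The real
// sample may define them differently; these are just enough to build
// and run the four variants.
#include <chrono>
#include <cstddef>
#include <new>
#include <vector>

template <typename T>
struct VectorAllocator {
  using value_type = T;
  VectorAllocator() = default;
  template <typename U>
  VectorAllocator(const VectorAllocator<U> &) {}
  T *allocate(std::size_t n) {
    // 64-byte alignment: a common choice matching the cache-line size
    return static_cast<T *>(
        ::operator new(n * sizeof(T), std::align_val_t{64}));
  }
  void deallocate(T *p, std::size_t) {
    ::operator delete(p, std::align_val_t{64});
  }
};

template <typename T, typename U>
bool operator==(const VectorAllocator<T> &, const VectorAllocator<U> &) {
  return true;
}
template <typename T, typename U>
bool operator!=(const VectorAllocator<T> &, const VectorAllocator<U> &) {
  return false;
}

template <typename T>
using AlignedVector = std::vector<T, VectorAllocator<T>>;

// Wall-clock timer; Elapsed() returns seconds since construction
class Timer {
 public:
  Timer() : start_(std::chrono::steady_clock::now()) {}
  double Elapsed() const {
    return std::chrono::duration<double>(std::chrono::steady_clock::now() -
                                         start_)
        .count();
  }

 private:
  std::chrono::steady_clock::time_point start_;
};

With these in place, the four variants can be called on the same inputs and their per-iteration times compared directly. On a discrete accelerator, myFunc1 is typically the slowest because of the two extra transfers per iteration, and myFunc4 the fastest.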