Avoiding Redundant Queue Constructions

Developer Guide

oneAPI GPU Optimization Guide

Download PDF

ID 771772

Date 4/18/2024

Version

Public

A newer version of this document is available. Customers should click here to go to the newest version.

Visible to Intel only — GUID: GUID-76DEBDE6-DDA3-412B-B325-A0D1FA87CE94

View Details

Avoiding Redundant Queue Constructions

To execute kernels on a device, the user must create a queue, which references an associated context, platform, and device. These may be chosen automatically, or specified by the user.

A context is constructed, either directly by the user or implicitly when creating a queue, to hold all the runtime information required by the SYCL runtime and the SYCL backend to operate on a device. When a queue is created with no context specified, a new context is implicitly constructed using the default constructor. In general, creating a new context is a heavy duty operation due to the need for JIT compiling the program every time a kernel is submitted to a queue with a new context. For good performance one should use as few contexts as possible in their application.

In the following example, a queue is created inside the loop and the kernel is submitted to this new queue. This will essentially invoke the JIT compiler for every iteration of the loop.

int reductionMultipleQMultipleC(std::vector<int> &data, int iter) {
  const size_t data_size = data.size();
  int sum = 0;

  int work_group_size = 512;
  int num_work_groups = 1;
  int num_work_items = work_group_size;

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};

  sycl::buffer<int> buf(data.data(), data_size, props);
  sycl::buffer<int> sum_buf(&sum, 1, props);

  sycl::queue q1{sycl::default_selector_v, exception_handler};
  // initialize data on the device
  q1.submit([&](auto &h) {
    sycl::accessor buf_acc(buf, h, sycl::write_only, sycl::no_init);
    h.parallel_for(data_size, [=](auto index) { buf_acc[index] = 1; });
  });

  double elapsed = 0;
  for (int i = 0; i < iter; i++) {
    sycl::queue q2{sycl::default_selector_v, exception_handler};
    if (i == 0)
      std::cout << q2.get_device().get_info<sycl::info::device::name>() << "\n";
    // reductionMultipleQMultipleC main begin
    Timer timer;
    q2.submit([&](auto &h) {
      sycl::accessor buf_acc(buf, h, sycl::read_only);
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);
      sycl::local_accessor<int, 1> scratch(work_group_size, h);
      h.parallel_for(sycl::nd_range<1>{num_work_items, work_group_size},
                     [=](sycl::nd_item<1> item) {
                       size_t loc_id = item.get_local_id(0);
                       int sum = 0;
                       for (int i = loc_id; i < data_size; i += num_work_items)
                         sum += buf_acc[i];
                       scratch[loc_id] = sum;
                       for (int i = work_group_size / 2; i > 0; i >>= 1) {
                         item.barrier(sycl::access::fence_space::local_space);
                         if (loc_id < i)
                           scratch[loc_id] += scratch[loc_id + i];
                       }
                       if (loc_id == 0)
                         sum_acc[0] = scratch[0];
                     });
    });
    // reductionMultipleQMultipleC main end
    q2.wait();
    sycl::host_accessor h_acc(sum_buf);
    sum = h_acc[0];
    elapsed += timer.Elapsed();
  }
  elapsed = elapsed / iter;
  if (sum == sum_expected)
    std::cout << "SUCCESS: Time reductionMultipleQMultipleC   = " << elapsed
              << "s"
              << " sum = " << sum << "\n";
  else
    std::cout << "ERROR: reductionMultipleQMultipleC Expected " << sum_expected
              << " but got " << sum << "\n";
  return sum;
} // end reductionMultipleQMultipleC

The above program can be rewritten by moving the queue declaration outside the loop, which improves performance quite dramatically.

int reductionSingleQ(std::vector<int> &data, int iter) {
  const size_t data_size = data.size();
  int sum = 0;

  int work_group_size = 512;
  int num_work_groups = 1;
  int num_work_items = work_group_size;

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};

  sycl::buffer<int> buf(data.data(), data_size, props);
  sycl::buffer<int> sum_buf(&sum, 1, props);
  sycl::queue q{sycl::default_selector_v, exception_handler};
  std::cout << q.get_device().get_info<sycl::info::device::name>() << "\n";

  // initialize data on the device
  q.submit([&](auto &h) {
    sycl::accessor buf_acc(buf, h, sycl::write_only, sycl::no_init);
    h.parallel_for(data_size, [=](auto index) { buf_acc[index] = 1; });
  });

  double elapsed = 0;
  for (int i = 0; i < iter; i++) {
    // reductionIntBarrier main begin
    Timer timer;
    q.submit([&](auto &h) {
      sycl::accessor buf_acc(buf, h, sycl::read_only);
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);
      sycl::local_accessor<int, 1> scratch(work_group_size, h);
      h.parallel_for(sycl::nd_range<1>{num_work_items, work_group_size},
                     [=](sycl::nd_item<1> item) {
                       size_t loc_id = item.get_local_id(0);
                       int sum = 0;
                       for (int i = loc_id; i < data_size; i += num_work_items)
                         sum += buf_acc[i];
                       scratch[loc_id] = sum;
                       for (int i = work_group_size / 2; i > 0; i >>= 1) {
                         item.barrier(sycl::access::fence_space::local_space);
                         if (loc_id < i)
                           scratch[loc_id] += scratch[loc_id + i];
                       }
                       if (loc_id == 0)
                         sum_acc[0] = scratch[0];
                     });
    });
    // reductionSingleQ main end
    q.wait();
    sycl::host_accessor h_acc(sum_buf);
    sum = h_acc[0];
    elapsed += timer.Elapsed();
  }
  elapsed = elapsed / iter;
  if (sum == sum_expected)
    std::cout << "SUCCESS: Time reductionSingleQ   = " << elapsed << "s"
              << " sum = " << sum << "\n";
  else
    std::cout << "ERROR: reductionSingleQ Expected " << sum_expected
              << " but got " << sum << "\n";
  return sum;
} // end reductionSingleQ

In case you need to create multiple queues, try to share the contexts among the queues. This will improve the performance. The above kernel is rewritten as shown below where the new queues created inside the loop and the queue outside the loop share the context. In this case the performance is same as the one with one queue.

int reductionMultipleQSingleC(std::vector<int> &data, int iter) {
  const size_t data_size = data.size();
  int sum = 0;

  int work_group_size = 512;
  int num_work_groups = 1;
  int num_work_items = work_group_size;

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};

  sycl::buffer<int> buf(data.data(), data_size, props);
  sycl::buffer<int> sum_buf(&sum, 1, props);

  sycl::queue q1{sycl::default_selector_v, exception_handler};
  // initialize data on the device
  q1.submit([&](auto &h) {
    sycl::accessor buf_acc(buf, h, sycl::write_only, sycl::no_init);
    h.parallel_for(data_size, [=](auto index) { buf_acc[index] = 1; });
  });

  double elapsed = 0;
  for (int i = 0; i < iter; i++) {
    sycl::queue q2{q1.get_context(), sycl::default_selector_v,
                   exception_handler};
    if (i == 0)
      std::cout << q2.get_device().get_info<sycl::info::device::name>() << "\n";
    // reductionMultipleQSingleC main begin
    Timer timer;
    q2.submit([&](auto &h) {
      sycl::accessor buf_acc(buf, h, sycl::read_only);
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);
      sycl::local_accessor<int, 1> scratch(work_group_size, h);
      h.parallel_for(sycl::nd_range<1>{num_work_items, work_group_size},
                     [=](sycl::nd_item<1> item) {
                       size_t loc_id = item.get_local_id(0);
                       int sum = 0;
                       for (int i = loc_id; i < data_size; i += num_work_items)
                         sum += buf_acc[i];
                       scratch[loc_id] = sum;
                       for (int i = work_group_size / 2; i > 0; i >>= 1) {
                         item.barrier(sycl::access::fence_space::local_space);
                         if (loc_id < i)
                           scratch[loc_id] += scratch[loc_id + i];
                       }
                       if (loc_id == 0)
                         sum_acc[0] = scratch[0];
                     });
    });
    // reductionMultipleQSingleC main end
    q2.wait();
    sycl::host_accessor h_acc(sum_buf);
    sum = h_acc[0];
    elapsed += timer.Elapsed();
  }
  elapsed = elapsed / iter;
  if (sum == sum_expected)
    std::cout << "SUCCESS: Time reductionMultipleQSingleContext  = " << elapsed
              << "s"
              << " sum = " << sum << "\n";
  else
    std::cout << "ERROR: reductionMultipleQSingleContext Expected "
              << sum_expected << " but got " << sum << "\n";
  return sum;
} // end reductionMultipleQSingleC

Select Your Language

Using Intel.com Search

Quick Links

Recent Searches

Advanced Search

Only search in

oneAPI GPU Optimization Guide

Avoiding Redundant Queue Constructions