SYCL Sample Code
main.cpp File
#include <algorithm>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <limits>
#include <numeric>
#include <vector>

#include <sycl/sycl.hpp>
#include <sycl/ext/intel/fpga_extensions.hpp>
using namespace sycl;
// Histogram width: the bin count is fixed at compile time.
constexpr int kNumBins{10};
// Kernel name type, forward-declared at global scope to reduce name mangling.
class histogram;
int main(int argc, char* argv[]) {
// parse command line args
uint count = 1000000;
if (argc > 1) {
count = atoi(argv[1]);
}
// host input and output memory
std::vector<int> in_h(count);
std::array<int, kNumBins> bins_h = {0};
std::array<int, kNumBins> bins_ref_h = {0};
// generate random input and compute the expected result
std::generate(in_h.begin(), in_h.end(), [] { return rand() % 100; });
for (auto& x : in_h) { bins_ref_h[x % kNumBins]++; };
// the device selector
#ifdef FPGA_EMULATOR
ext::intel::fpga_emulator_selector selector;
#else
ext::intel::fpga_selector selector;
#endif
// create the device queue
queue q(selector);
try {
// create SYCL buffers for inputs and outputs
// providing host pointers (in this case, std::vectors and std::array)
// allows the runtime to automatically migrate input and output data
// to and from the device on demand
buffer in_buf(in_h);
buffer bins_buf(bins_h);
// launch the kernel
event kernel_event = q.submit([&](handler& h) {
// get accessors to the SYCL buffers
// 'no_init' tells the runtime that we don't care about the initial
// contents of the output (z) and avoids copying the output from host to
// device before launching the kernel.
accessor in(in_buf, h, read_only);
accessor bins(bins_buf, h, write_only, no_init);
h.single_task<histogram>([=]() [[intel::kernel_args_restrict]] {
// store a local copy of the histogram to avoid read-accumulate-writes
// to global memory
[[intel::fpga_register]] int bins_local[kNumBins];
// initialize the local bins
#pragma unroll
for (uint i = 0; i < kNumBins; i++) {
bins_local[i] = 0;
}
// compute the histogram
[[intel::initiation_interval(1)]]
for (uint i = 0; i < count; i++) {
bins_local[in[i] % kNumBins]++;
}
// write back the local copy to global memory
#pragma unroll
for (uint i = 0; i < kNumBins; i++) {
bins[i] = bins_local[i];
}
});
});
} catch (exception const& e) {
std::cout << "Caught a synchronous SYCL exception: " << e.what() << "\n";
std::terminate();
}
// Exiting the try-catch scope will cause the buffer destructors to be called
// which will result in an implicit 'wait' on the kernel to finish (since
// the kernel uses the buffers).
// Therefore, at this point in the code, we know the kernel has finished
// and the data has been transferred back to the host.
// Since x_buf and y_buf are only accessed with 'read_only' buffers,
// the runtime will not copy them back from the device.
// validate the results
bool passed = std::equal(bins_h.begin(), bins_h.end(), bins_ref_h.begin());
if (passed) {
printf("PASSED\n");
} else {
printf("FAILED\n");
}
return passed;
}
Makefile
# Target board passed to the hardware compiles (report and fpga targets).
BOARD=intel_a10gx_pac:pac_a10

# None of these targets creates a file with the target's own name, so mark
# them phony; a stray file called e.g. 'clean' must not suppress the rule.
.PHONY: fpga_emu report fpga clean

# Emulator executable; -DFPGA_EMULATOR selects the emulator device in main.cpp.
fpga_emu: main.cpp
	icpx -fsycl -fintelfpga -DFPGA_EMULATOR main.cpp -o main.fpga_emu

# Early-link hardware compile that produces main_report.a.
report: main.cpp
	icpx -fsycl -fintelfpga -Xshardware -Xstarget=$(BOARD) -fsycl-link=early main.cpp -o main_report.a

# Full hardware compile; -reuse-exe points at the previous main.fpga.
fpga: main.cpp
	icpx -fsycl -fintelfpga -Xshardware -Xstarget=$(BOARD) -reuse-exe=main.fpga main.cpp -o main.fpga

# Remove intermediate artifacts (the built executables are left in place).
clean:
	rm -rf *.o *.a *.prj
Parent topic: Histogram Design Example Walkthrough