Sample Application
The sample code below shows how to use the oneCCL API to perform an allreduce operation on SYCL Unified Shared Memory (USM) buffers.
#include <cstdlib>
#include <iostream>

#include <mpi.h>
#include <sycl/sycl.hpp>

#include "oneapi/ccl.hpp"

void mpi_finalize() {
    int is_finalized = 0;
    MPI_Finalized(&is_finalized);
    if (!is_finalized) {
        MPI_Finalize();
    }
}
int main(int argc, char* argv[]) {
    constexpr size_t count = 10 * 1024 * 1024;

    int size = 0;
    int rank = 0;

    ccl::init();

    MPI_Init(nullptr, nullptr);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    atexit(mpi_finalize);

    auto device_selector = sycl::default_selector_v;
    sycl::queue q(device_selector);
    std::cout << "Running on " << q.get_device().get_info<sycl::info::device::name>() << "\n";
    /* create kvs */
    ccl::shared_ptr_class<ccl::kvs> kvs;
    ccl::kvs::address_type main_addr;
    if (rank == 0) {
        kvs = ccl::create_main_kvs();
        main_addr = kvs->get_address();
        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
        kvs = ccl::create_kvs(main_addr);
    }
    /* create communicator */
    auto dev = ccl::create_device(q.get_device());
    auto ctx = ccl::create_context(q.get_context());
    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);

    /* create stream */
    auto stream = ccl::create_stream(q);

    /* create buffers */
    auto send_buf = sycl::malloc_device<int>(count, q);
    auto recv_buf = sycl::malloc_device<int>(count, q);
    /* fill the buffers on the device side */
    auto e = q.submit([&](auto& h) {
        h.parallel_for(count, [=](auto id) {
            send_buf[id] = rank + id + 1;
            recv_buf[id] = -1;
        });
    });

    int check_sum = 0;
    for (int i = 1; i <= size; ++i) {
        check_sum += i;
    }

    /* do not wait for the kernel to complete; instead pass it as a dependency to the collective operation */
    std::vector<ccl::event> deps;
    deps.push_back(ccl::create_event(e));

    /* invoke allreduce */
    auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
    ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait();
    /* check the correctness of recv_buf on the device side */
    sycl::buffer<int> check_buf(count);
    q.submit([&](auto& h) {
        sycl::accessor check_buf_acc(check_buf, h, sycl::write_only);
        h.parallel_for(count, [=](auto id) {
            if (recv_buf[id] != static_cast<int>(check_sum + size * id)) {
                check_buf_acc[id] = -1;
            }
            else {
                /* mark the element as correct so the host-side check does not read an uninitialized value */
                check_buf_acc[id] = 0;
            }
        });
    });

    q.wait_and_throw();

    /* print out the result of the test on the host side */
    {
        sycl::host_accessor check_buf_acc(check_buf, sycl::read_only);
        size_t i;
        for (i = 0; i < count; i++) {
            if (check_buf_acc[i] == -1) {
                std::cout << "FAILED\n";
                break;
            }
        }
        if (i == count) {
            std::cout << "PASSED\n";
        }
    }

    sycl::free(send_buf, q);
    sycl::free(recv_buf, q);
}
Build details
Build oneCCL with SYCL support (only the Intel® oneAPI DPC++/C++ Compiler is supported).
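If the compiler and the oneCCL libraries are not already available in your shell, initialize the oneAPI environment first. Assuming a default installation location, this is typically done with:
source /opt/intel/oneapi/setvars.sh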
Use the C++ driver with the -fsycl option to build the sample:
icpx -o sample sample.cpp -lccl -lmpi -fsycl
Run the sample
The Intel® MPI Library is required to run the sample. Make sure that the MPI environment is set up.
To run the sample, use the following command:
mpiexec <parameters> ./sample
where <parameters> represents optional mpiexec parameters such as node count, processes per node, hosts, and so on.
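For example, assuming two ranks on the local host (adjust the parameters to your cluster configuration):
mpiexec -n 2 ./sample
Each rank prints the device it runs on and reports PASSED if the allreduce result is correct.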
NOTE:
Explore the complete list of oneAPI code samples in the oneAPI Samples Catalog. These samples were designed to help you develop, offload, and optimize multiarchitecture applications targeting CPUs, GPUs, and FPGAs.