Intel® oneAPI Collective Communications Library (oneCCL) Developer Guide and Reference
Build a Sample Application
The sample code below shows how to use the oneCCL API to perform an allreduce operation on SYCL USM (Unified Shared Memory) buffers.
#include <cstdlib>
#include <iostream>

#include <sycl/sycl.hpp>

#include <mpi.h>

#include "oneapi/ccl.hpp"

void mpi_finalize() {
    int is_finalized = 0;
    MPI_Finalized(&is_finalized);
    if (!is_finalized) {
        MPI_Finalize();
    }
}

int main(int argc, char* argv[]) {
    constexpr size_t count = 10 * 1024 * 1024;

    int size = 0;
    int rank = 0;

    ccl::init();

    MPI_Init(nullptr, nullptr);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    atexit(mpi_finalize);

    /* find and initialize Level-Zero devices and queues */
    std::vector<sycl::device> devices;
    std::vector<sycl::queue> queues;
    auto platform_list = sycl::platform::get_platforms();
    for (const auto& platform : platform_list) {
        auto platform_name = platform.get_info<sycl::info::platform::name>();
        bool is_level_zero = platform_name.find("Level-Zero") != std::string::npos;
        if (is_level_zero) {
            std::cout << "Platform name is: " << platform_name << std::endl;
            auto device_list = platform.get_devices();
            for (const auto& device : device_list) {
                if (device.is_gpu()) {
                    devices.push_back(device);
                }
            }
        }
    }

    if (devices.size() < static_cast<size_t>(size)) {
        std::cerr << "Not enough devices for all ranks" << std::endl;
        exit(-1);
    }

    sycl::context context(devices);
    for (size_t i = 0; i < devices.size(); ++i) {
        if (i == static_cast<size_t>(rank)) {
            /* only create an in-order queue for the current rank's device */
            queues.push_back(sycl::queue(context, devices[i], { sycl::property::queue::in_order() }));
            break;
        }
    }

    if (queues.empty()) {
        std::cerr << "No queue created for rank " << rank << std::endl;
        exit(-1);
    }

    /* use the only queue in the queues vector for the current rank */
    sycl::queue& q = queues[0];

    /* create kvs: rank 0 creates the main kvs and broadcasts its address over MPI */
    ccl::shared_ptr_class<ccl::kvs> kvs;
    ccl::kvs::address_type main_addr;
    if (rank == 0) {
        kvs = ccl::create_main_kvs();
        main_addr = kvs->get_address();
        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
        kvs = ccl::create_kvs(main_addr);
    }

    /* create communicator */
    auto dev = ccl::create_device(q.get_device());
    auto ctx = ccl::create_context(q.get_context());
    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);

    /* create stream */
    auto stream = ccl::create_stream(q);

    /* create USM device buffers */
    auto send_buf = sycl::malloc_device<int>(count, q);
    auto recv_buf = sycl::malloc_device<int>(count, q);

    /* open buffers and modify them on the device side */
    auto e = q.submit([&](auto& h) {
        h.parallel_for(count, [=](auto id) {
            send_buf[id] = rank + id + 1;
            recv_buf[id] = -1;
        });
    });

    int check_sum = 0;
    for (int i = 1; i <= size; ++i) {
        check_sum += i;
    }

    /* do not wait for the kernel to complete; pass it as a dependency of the collective */
    std::vector<ccl::event> deps;
    deps.push_back(ccl::create_event(e));

    /* invoke allreduce */
    auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
    ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait();

    /* open recv_buf and check its correctness on the device side */
    sycl::buffer<int> check_buf(count);
    q.submit([&](auto& h) {
        sycl::accessor check_buf_acc(check_buf, h, sycl::write_only);
        h.parallel_for(count, [=](auto id) {
            if (recv_buf[id] != static_cast<int>(check_sum + size * id)) {
                check_buf_acc[id] = -1;
            }
            else {
                /* initialize explicitly so the host-side check is deterministic */
                check_buf_acc[id] = 0;
            }
        });
    });

    q.wait_and_throw();

    /* print out the result of the test on the host side */
    {
        sycl::host_accessor check_buf_acc(check_buf, sycl::read_only);
        size_t i;
        for (i = 0; i < count; i++) {
            if (check_buf_acc[i] == -1) {
                std::cout << "FAILED\n";
                break;
            }
        }
        if (i == count) {
            std::cout << "PASSED\n";
        }
    }

    sycl::free(send_buf, q);
    sycl::free(recv_buf, q);
}
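The sample exits if it finds fewer Level-Zero GPU devices than MPI ranks. If you are unsure which platforms and devices your installation exposes, a minimal standalone listing program can help; the sketch below is a hypothetical helper, not part of the sample, and uses only standard SYCL 2020 queries:

#include <iostream>

#include <sycl/sycl.hpp>

/* hypothetical helper: print every SYCL platform and its devices */
int main() {
    for (const auto& platform : sycl::platform::get_platforms()) {
        std::cout << platform.get_info<sycl::info::platform::name>() << std::endl;
        for (const auto& device : platform.get_devices()) {
            std::cout << "  " << device.get_info<sycl::info::device::name>() << std::endl;
        }
    }
    return 0;
}

Compile it the same way as the sample (icpx -fsycl). A platform whose name contains "Level-Zero" must appear in the output for the sample above to find any devices.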
Build the Library
Build the library with SYCL support (only the Intel® oneAPI DPC++/C++ Compiler is supported):
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp
make install
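For reference, a complete build sequence might look as follows. This is a sketch that assumes you are building from a clone of the oneCCL source repository and installs into a local _install directory; both the clone location and the install prefix are arbitrary examples, so adjust them to your setup:

git clone https://github.com/oneapi-src/oneCCL.git
cd oneCCL
mkdir build && cd build
# configure with the DPC++ compilers and the SYCL (dpcpp) compute backend
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCMAKE_INSTALL_PREFIX=$(pwd)/_install
make -j install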
Use the C++ driver with the -fsycl option to build the sample:
icpx -o sample sample.cpp -lccl -lmpi -fsycl
Run the Sample
The Intel® MPI Library is required to run the sample. Make sure that the MPI environment is set up.
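For example, with a default oneAPI installation the environment can usually be set up by sourcing the top-level environment script; the installation path below is an assumption, so adjust it to your system:

source /opt/intel/oneapi/setvars.sh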
To run the sample, use the following command:
mpiexec <parameters> ./sample
where <parameters> represents optional mpiexec parameters such as node count, processes per node, hosts, and so on.
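For example, the following command launches the sample with two ranks on the local host (assuming at least two Level-Zero GPU devices are visible):

mpiexec -n 2 ./sample

On success, each rank prints PASSED.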