Visible to Intel only — GUID: GUID-98386A17-B40D-498D-8F02-CFB36CE46A2E
Visible to Intel only — GUID: GUID-98386A17-B40D-498D-8F02-CFB36CE46A2E
DPCT1020
Message
Migration of <api name>, if it is called from __global__ or __device__ function, is not supported. You may need to redesign the code to use the host-side <api name> instead, which submits this call to the SYCL queue automatically.
Detailed Help
The warning message is generated in cases where the <api name> itself submits the SYCL* kernel to the command queue, and the caller of <api-name> is the SYCL kernel that is submitted to the command queue itself. It results in device-side enqueue of the kernel, which is not supported by SYCL 1.2.1.
Suggestions to Fix
Redesign the code to use the host-side API, which submits this call to the SYCL queue automatically.
For example, this original CUDA* code:
__global__ void kernel(float *d_data) {
int tid = threadIdx.x;
d_data[tid + 1] = tid;
__syncthreads();
if (tid == 0) {
cublasHandle_t handle;
cublasCreate(&handle);
cublasSasum(handle, 128, d_data + 1, 1, d_data)
cublasDestroy(handle);
}
}
void foo() {
float *d_data;
cudaMalloc((void **)&d_data, sizeof(float) * (1 + 128));
kernel<<<1, 128>>>(d_data);
float data;
cudaMemcpy(data, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
}
results in the following migrated SYCL code:
void kernel(float *d_data, sycl::nd_item<3> item_ct1) {
int tid = item_ct1.get_local_id(2);
d_data[tid + 1] = tid;
item_ct1.barrier();
if (tid == 0) {
/*
DPCT1021:2: Migration of cublasHandle_t in __global__ or __device__ function
is not supported. You may need to redesign the code.
*/
cublasHandle_t handle;
handle = &dpct::get_default_queue();
/*
DPCT1020:1: Migration of cublasSasum, if it is called from __global__ or
__device__ function, is not supported. You may need to redesign the code to
use the host-side oneapi::mkl::blas::column_major::asum instead, which submits
this call to the SYCL queue automatically.
*/
cublasSasum(handle, 128, d_data + 1, 1, d_data);
handle = nullptr;
}
}
void foo() {
dpct::device_ext &dev_ct1 = dpct::get_current_device();
sycl::queue &q_ct1 = dev_ct1.default_queue();
float *d_data;
d_data = sycl::malloc_device<float>((1 + 128), q_ct1);
q_ct1.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
[=](sycl::nd_item<3> item_ct1) {
kernel(d_data, item_ct1);
});
float data;
q_ct1.memcpy(&data, d_data, sizeof(float)).wait();
sycl::free(d_data, q_ct1);
}
which is rewritten to:
void kernel(float *d_data, sycl::nd_item<3> item_ct1) {
int tid = item_ct1.get_local_id(2);
d_data[tid + 1] = tid;
}
void foo() {
dpct::device_ext &dev_ct1 = dpct::get_current_device();
sycl::queue &q_ct1 = dev_ct1.default_queue();
float *d_data;
d_data = sycl::malloc_device<float>((1 + 128), q_ct1);
q_ct1.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
[=](sycl::nd_item<3> item_ct1) {
kernel(d_data, item_ct1);
});
oneapi::mkl::blas::column_major::asum(q_ct1, 128, d_data + 1, 1, d_data);
float data;
q_ct1.memcpy(&data, d_data, sizeof(float)).wait();
sycl::free(d_data, q_ct1);
}