Visible to Intel only — GUID: GUID-6C24CD30-A12D-4D47-A793-72CDB7AD5552
Visible to Intel only — GUID: GUID-6C24CD30-A12D-4D47-A793-72CDB7AD5552
Configuring and computing a DFT in DPC++
Some usage examples are provided below for computing (possibly batched) DFTs using device-accessible USM allocations, with default or otherwise recommended strides. More working examples can also be found in the oneMKL installation directory under share/doc/mkl/examples/sycl/dft.
To use the DPC++ interface, include oneapi/mkl/dft.hpp. The DPC++ interface supports CPU and Intel GPU devices with some limitations, documented in previous pages. Refer to the Intel® oneAPI Math Kernel Library Release Notes for other (release-specific) known limitations.
The DFT functions assume the Cartesian representation of complex data; that is, complex numbers are defined by their real and imaginary parts. oneMKL provides efficient Vector Mathematical Functions for conversion to and from polar representation.
Usage examples
The following example illustrates the computation of (possibly many) complex and real discrete Fourier transforms using device-accessible USM allocations without dependencies on other events, via the DPC++ interface of oneMKL.
#include <oneapi/mkl/dft.hpp>
#include <complex>
enum class dft_compute_direction {
FWD_DFT,
BWD_DFT
};
namespace dft_ns = oneapi::mkl::dft;
template<typename T>
bool is_usable_compute_type = std::is_same_v<T, float> |
std::is_same_v<T, std::complex<float>> |
std::is_same_v<T, double> |
std::is_same_v<T, std::complex<double>>;
template<dft_ns::precision prec, dft_ns::domain dom>
void ready_descriptor(dft_ns::descriptor<prec, dom>& desc,
sycl::queue& q,
dft_ns::config_value placement,
std::int64_t batch_size) {
// this example assumes that desc was freshly created and that its default
// configuration values have not been changed prior to invoking this routine
// Fetch rank, lengths and type of forward domain
std::int64_t rank;
desc.get_value(dft_ns::config_param::DIMENSION, &rank);
std::vector<std::int64_t> lengths(rank);
desc.get_value(dft_ns::config_param::LENGTHS, &lengths);
if (placement == dft_ns::config_value::NOT_INPLACE) {
// non-default behavior
desc.set_value(dft_ns::config_param::PLACEMENT, placement);
if constexpr (dom == dft_ns::domain::REAL) {
// for real descriptors interpreting backward domain's data type as
// complex (default behavior), unpadded forward-domain data layouts
// may be used if operating out-of-place:
std::vector<std::int64_t> fwd_strides(rank + 1);
std::vector<std::int64_t> bwd_strides(rank + 1);
fwd_strides[rank] = bwd_strides[rank] = 1;
if (rank > 1) {
fwd_strides[rank - 1] = lengths[rank - 1];
bwd_strides[rank - 1] = lengths[rank - 1] / 2 + 1;
for (std::int64_t dim = 2; dim < rank; dim++) {
fwd_strides[rank - dim] = fwd_strides[rank - dim + 1] * lengths[rank - dim];
bwd_strides[rank - dim] = bwd_strides[rank - dim + 1] * lengths[rank - dim];
}
}
fwd_strides[0] = bwd_strides[0] = 0; // offsets in data layouts
desc.set_value(dft_ns::config_param::FWD_STRIDES, fwd_strides);
desc.set_value(dft_ns::config_param::BWD_STRIDES, bwd_strides);
}
}
if (batch_size > 1) {
// configuration of batched DFTs
desc.set_value(dft_ns::config_param::NUMBER_OF_TRANSFORMS, batch_size);
std::int64_t fwd_distance, bwd_distance;
fwd_distance = bwd_distance = 1;
for (std::int64_t dim = 0; dim < rank - 1; dim++) {
fwd_distance *= lengths[dim];
bwd_distance *= lengths[dim];
}
if constexpr (dom == dft_ns::domain::REAL) {
if (placement == dft_ns::config_value::NOT_INPLACE)
fwd_distance *= lengths[rank - 1];
else
fwd_distance *= 2 * (lengths[rank - 1] / 2 + 1);
bwd_distance *= (lengths[rank - 1] / 2 + 1);
}
else {
fwd_distance *= lengths[rank - 1];
bwd_distance *= lengths[rank - 1];
}
desc.set_value(dft_ns::config_param::FWD_DISTANCE, fwd_distance);
desc.set_value(dft_ns::config_param::BWD_DISTANCE, bwd_distance);
}
desc.commit(q);
}
template <typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_inplace_cmplx_dft(sycl::queue& q,
const std::vector<std::int64_t>& lengths,
const std::int64_t& batch_size,
const dft_compute_direction& direction,
T* device_accessible_usm_data) {
constexpr auto prec =
std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>> ?
dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
auto desc = dft_ns::descriptor<prec, dft_ns::domain::COMPLEX>(lengths);
ready_descriptor(desc, q, dft_ns::config_value::INPLACE, batch_size);
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is Z[ m*lengths[0] + k1 ]
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data);
// (0 <= kj < lengths[j-1])
if (direction == dft_compute_direction::FWD_DFT)
return dft_ns::compute_forward(desc, device_accessible_usm_data);
else
return dft_ns::compute_backward(desc, device_accessible_usm_data);
// Upon completion of any of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is Z[ m*lengths[0] + k1 ]
// (0 <= kj < lengths[j-1])
}
template <typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_outofplace_cmplx_dft(sycl::queue& q,
const std::vector<std::int64_t>& lengths,
const std::int64_t& batch_size,
const dft_compute_direction& direction,
T* device_accessible_usm_data_in,
T* device_accessible_usm_data_out) {
constexpr bool is_single_precision =
std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
constexpr auto prec = (is_single_precision) ?
dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
auto desc = dft_ns::descriptor<prec, dft_ns::domain::COMPLEX>(lengths);
ready_descriptor(desc, q, dft_ns::config_value::NOT_INPLACE, batch_size);
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is Z[ m*lengths[0] + k1 ]
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data_in);
// (0 <= kj < lengths[j-1])
if (direction == dft_compute_direction::FWD_DFT)
return dft_ns::compute_forward(desc, device_accessible_usm_data_in,
device_accessible_usm_data_out);
else
return dft_ns::compute_backward(desc, device_accessible_usm_data_in,
device_accessible_usm_data_out);
// Upon completion of any of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// = Y[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// = Y[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// = Y[ m*lengths[0] + k1 ]
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* Y =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data_out);
// (0 <= kj < lengths[j-1])
}
template<typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_inplace_real_dft(sycl::queue& q,
const std::vector<std::int64_t>& lengths,
const std::int64_t& batch_size,
const dft_compute_direction& direction,
T* device_accessible_usm_data) {
constexpr bool is_single_precision =
std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
constexpr auto prec = (is_single_precision) ?
dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
auto desc = dft_ns::descriptor<prec, dft_ns::domain::REAL>(lengths);
ready_descriptor(desc, q, dft_ns::config_value::INPLACE, batch_size);
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// Let
// real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data);
// and
// std::complex<real_t>* bwd_Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data);
if (direction == dft_compute_direction::FWD_DFT) {
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is fwd_Z[ m*lengths[0]*lengths[1]*2*(lengths[2]/2 + 1)
// + k1*lengths[1]*2*(lengths[2]/2 + 1)
// + k2*2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is fwd_Z[ m*lengths[0]*2*(lengths[1]/2 + 1)
// + k1*2*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is fwd_Z[ m*2*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1])
return dft_ns::compute_forward(desc, device_accessible_usm_data);
// Upon completion of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
// + k1*lengths[1]*(lengths[2]/2 + 1)
// + k2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
// + k1*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
// 0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
}
else {
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
// + k1*lengths[1]*(lengths[2]/2 + 1)
// + k2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
// + k1*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
// 0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
return dft_ns::compute_backward(desc, device_accessible_usm_data);
// Upon completion of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is fwd_Z[ m*lengths[0]*lengths[1]*2*(lengths[2]/2 + 1)
// + k1*lengths[1]*2*(lengths[2]/2 + 1)
// + k2*2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is fwd_Z[ m*lengths[0]*2*(lengths[1]/2 + 1)
// + k1*2*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is fwd_Z[ m*2*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1])
}
}
template<typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_outofplace_real_dft(sycl::queue& q,
const std::vector<std::int64_t>& lengths,
const std::int64_t& batch_size,
const dft_compute_direction& direction,
T* device_accessible_usm_data_in,
T* device_accessible_usm_data_out) {
constexpr bool is_single_precision =
std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
constexpr auto prec = (is_single_precision) ?
dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
auto desc = dft_ns::descriptor<prec, dft_ns::domain::REAL>(lengths);
ready_descriptor(desc, q, dft_ns::config_value::NOT_INPLACE, batch_size);
if (direction == dft_compute_direction::FWD_DFT) {
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is fwd_Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is fwd_Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is fwd_Z[ m*lengths[0] + k1 ]
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data_in);
// (0 <= kj < lengths[j-1])
return dft_ns::compute_forward(desc, device_accessible_usm_data_in,
device_accessible_usm_data_out);
// Upon completion of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
// + k1*lengths[1]*(lengths[2]/2 + 1)
// + k2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
// + k1*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
// 0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* bwd_Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data_out);
}
else {
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
// + k1*lengths[1]*(lengths[2]/2 + 1)
// + k2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
// + k1*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
// 0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* bwd_Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data_in);
return dft_ns::compute_backward(desc, device_accessible_usm_data_in,
device_accessible_usm_data_out);
// Upon completion of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is fwd_Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is fwd_Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is fwd_Z[ m*lengths[0] + k1 ]
// (0 <= kj < lengths[j-1])
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data_out);
}
}