Configuring and computing a DFT in DPC++
Usage examples are provided below for computing (possibly batched) DFTs with device-accessible USM allocations and default strides (where applicable). Users are also advised to consult the working examples found in the oneMKL installation directory under share/doc/mkl/examples/sycl/dft.
To use the DPC++ interface, include oneapi/mkl/dfti.hpp. The DPC++ interface supports CPU and Intel GPU devices. Some device-specific limitations are noted in this section, and the data layouts supported on GPU devices are documented in Supported layouts on GPU devices. Refer to the Intel® oneAPI Math Kernel Library Release Notes for other known limitations.
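For instance, a minimal sketch (not part of the documented example) of selecting a device and creating the sycl::queue to which DFT descriptors are later committed might look as follows; the preference for a GPU device and the fallback to the default device are illustrative assumptions.
#include <oneapi/mkl/dfti.hpp>
#include <sycl/sycl.hpp>

int main() {
    sycl::queue q;
    try {
        // prefer a GPU device if one is available
        q = sycl::queue{sycl::gpu_selector_v};
    }
    catch (const sycl::exception&) {
        // otherwise fall back to the default device (e.g., CPU)
        q = sycl::queue{sycl::default_selector_v};
    }
    // q may now be used to commit DFT descriptors, as in the examples below
    return 0;
}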
The DFT functions assume the Cartesian representation of complex data (that is, the real and imaginary parts define a complex number). The Intel® oneAPI Math Kernel Library Vector Mathematical Functions provide efficient tools for conversion to and from polar representation. See the Conversion of complex data examples.
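For illustration only, the relation between the two representations can be sketched in plain C++ with std::abs, std::arg, and std::polar; this sketch does not use the oneMKL Vector Mathematical Functions mentioned above, and the helper names are hypothetical.
#include <complex>

// Hypothetical helpers illustrating the Cartesian <-> polar relationship.
struct polar_value {
    double magnitude;
    double phase; // in radians
};

polar_value to_polar(std::complex<double> z) {
    // Cartesian (re, im) -> polar (magnitude, phase)
    return { std::abs(z), std::arg(z) };
}

std::complex<double> to_cartesian(const polar_value& p) {
    // polar (magnitude, phase) -> Cartesian (re, im)
    return std::polar(p.magnitude, p.phase);
}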
Usage examples
The following example illustrates how to compute (possibly many) complex and real discrete Fourier transforms via the DPC++ interface of oneMKL, using device-accessible USM allocations and no dependencies on other events.
#include <oneapi/mkl/dfti.hpp>
#include <sycl/sycl.hpp>

#include <complex>
#include <cstdint>
#include <type_traits>
#include <vector>

enum class dft_compute_direction {
    FWD_DFT,
    BWD_DFT
};

namespace dft_ns = oneapi::mkl::dft;
template<typename descriptor_type>
void ready_descriptor(descriptor_type& desc, sycl::queue& q,
                      enum DFTI_CONFIG_VALUE placement, int64_t batch_size) {
    // this example assumes that desc was freshly created and that its default
    // configuration values have not been changed prior to invoking this routine
    // Fetch rank, lengths and type of forward domain
    int64_t rank;
    desc.get_value(dft_ns::config_param::DIMENSION, &rank);
    std::vector<int64_t> lengths(rank);
    desc.get_value(dft_ns::config_param::LENGTHS, lengths.data());
    dft_ns::domain fwd_domain;
    desc.get_value(dft_ns::config_param::FORWARD_DOMAIN, &fwd_domain);
    if (placement == DFTI_NOT_INPLACE) {
        // non-default behavior
        desc.set_value(dft_ns::config_param::PLACEMENT, placement);
        if (fwd_domain == dft_ns::domain::REAL) {
            // for real descriptors interpreting backward domain's data type as
            // complex (default behavior), unpadded forward-domain data layouts
            // may be used if operating out-of-place:
            std::vector<int64_t> fwd_strides(rank + 1);
            std::vector<int64_t> bwd_strides(rank + 1);
            fwd_strides[rank] = bwd_strides[rank] = 1;
            if (rank > 1) {
                fwd_strides[rank - 1] = lengths[rank - 1];
                bwd_strides[rank - 1] = lengths[rank - 1] / 2 + 1;
                for (int64_t dim = 2; dim < rank; dim++) {
                    fwd_strides[rank - dim] = fwd_strides[rank - dim + 1] * lengths[rank - dim];
                    bwd_strides[rank - dim] = bwd_strides[rank - dim + 1] * lengths[rank - dim];
                }
            }
            fwd_strides[0] = bwd_strides[0] = 0; // offsets in data layouts
            desc.set_value(dft_ns::config_param::FWD_STRIDES, fwd_strides.data());
            desc.set_value(dft_ns::config_param::BWD_STRIDES, bwd_strides.data());
        }
    }
    if (batch_size > 1) {
        // configuration of batched DFTs
        desc.set_value(dft_ns::config_param::NUMBER_OF_TRANSFORMS, batch_size);
        int64_t fwd_distance, bwd_distance;
        fwd_distance = bwd_distance = 1;
        for (int64_t dim = 0; dim < rank - 1; dim++) {
            fwd_distance *= lengths[dim];
            bwd_distance *= lengths[dim];
        }
        if (fwd_domain == dft_ns::domain::REAL) {
            if (placement == DFTI_NOT_INPLACE)
                fwd_distance *= lengths[rank - 1];
            else
                fwd_distance *= 2 * (lengths[rank - 1] / 2 + 1);
            bwd_distance *= (lengths[rank - 1] / 2 + 1);
        }
        else {
            fwd_distance *= lengths[rank - 1];
            bwd_distance *= lengths[rank - 1];
        }
        desc.set_value(dft_ns::config_param::FWD_DISTANCE, fwd_distance);
        desc.set_value(dft_ns::config_param::BWD_DISTANCE, bwd_distance);
    }
    desc.commit(q);
}
template<typename T>
sycl::event compute_inplace_cmplx_dft(sycl::queue& q,
                                      const std::vector<int64_t>& lengths,
                                      const int64_t& batch_size,
                                      const dft_compute_direction& direction,
                                      T* device_accessible_usm_data) {
    constexpr bool is_single_precision =
        std::is_same_v<T, float> || std::is_same_v<T, std::complex<float>>;
    constexpr auto prec = (is_single_precision) ?
        dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
    auto desc = dft_ns::descriptor<prec, dft_ns::domain::COMPLEX>(lengths);
    ready_descriptor(desc, q, DFTI_INPLACE, batch_size);
    // Initialize data such that input data entry of multi-index
    // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
    //   is Z[ m*lengths[0]*lengths[1]*lengths[2]
    //         + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
    // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
    //   is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
    // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
    //   is Z[ m*lengths[0] + k1 ]
    // wherein
    //   std::complex<T>* Z = static_cast<std::complex<T>*>(device_accessible_usm_data);
    // (0 <= kj < lengths[j-1])
    if (direction == dft_compute_direction::FWD_DFT)
        return dft_ns::compute_forward(desc, device_accessible_usm_data);
    else
        return dft_ns::compute_backward(desc, device_accessible_usm_data);
    // Upon completion of any of the above, the output data entry of multi-index
    // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
    //   is Z[ m*lengths[0]*lengths[1]*lengths[2]
    //         + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
    // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
    //   is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
    // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
    //   is Z[ m*lengths[0] + k1 ]
    // (0 <= kj < lengths[j-1])
}
template<typename T_in, typename T_out>
sycl::event compute_outofplace_cmplx_dft(sycl::queue& q,
                                         const std::vector<int64_t>& lengths,
                                         const int64_t& batch_size,
                                         const dft_compute_direction& direction,
                                         T_in* device_accessible_usm_data_in,
                                         T_out* device_accessible_usm_data_out) {
    constexpr bool is_single_precision =
        std::is_same_v<T_in, float> || std::is_same_v<T_in, std::complex<float>>;
    constexpr auto prec = (is_single_precision) ?
        dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
    auto desc = dft_ns::descriptor<prec, dft_ns::domain::COMPLEX>(lengths);
    ready_descriptor(desc, q, DFTI_NOT_INPLACE, batch_size);
    // Initialize data such that input data entry of multi-index
    // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
    //   is Z[ m*lengths[0]*lengths[1]*lengths[2]
    //         + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
    // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
    //   is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
    // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
    //   is Z[ m*lengths[0] + k1 ]
    // wherein
    //   std::complex<T>* Z = static_cast<std::complex<T>*>(device_accessible_usm_data_in);
    // (0 <= kj < lengths[j-1])
    if (direction == dft_compute_direction::FWD_DFT)
        return dft_ns::compute_forward(desc, device_accessible_usm_data_in,
                                       device_accessible_usm_data_out);
    else
        return dft_ns::compute_backward(desc, device_accessible_usm_data_in,
                                        device_accessible_usm_data_out);
    // Upon completion of any of the above, the output data entry of multi-index
    // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
    //   = Y[ m*lengths[0]*lengths[1]*lengths[2]
    //        + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
    // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
    //   = Y[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
    // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
    //   = Y[ m*lengths[0] + k1 ]
    // wherein
    //   std::complex<T>* Y = static_cast<std::complex<T>*>(device_accessible_usm_data_out);
    // (0 <= kj < lengths[j-1])
}
template<typename T>
sycl::event compute_inplace_real_dft(sycl::queue& q,
                                     const std::vector<int64_t>& lengths,
                                     const int64_t& batch_size,
                                     const dft_compute_direction& direction,
                                     T* device_accessible_usm_data) {
    constexpr bool is_single_precision =
        std::is_same_v<T, float> || std::is_same_v<T, std::complex<float>>;
    constexpr auto prec = (is_single_precision) ?
        dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
    auto desc = dft_ns::descriptor<prec, dft_ns::domain::REAL>(lengths);
    ready_descriptor(desc, q, DFTI_INPLACE, batch_size);
    // Let
    //   T* fwd_Z = static_cast<T*>(device_accessible_usm_data);
    // and
    //   std::complex<T>* bwd_Z = static_cast<std::complex<T>*>(device_accessible_usm_data);
    if (direction == dft_compute_direction::FWD_DFT) {
        // Initialize data such that input data entry of multi-index
        // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
        //   is fwd_Z[ m*lengths[0]*lengths[1]*2*(lengths[2]/2 + 1)
        //             + k1*lengths[1]*2*(lengths[2]/2 + 1)
        //             + k2*2*(lengths[2]/2 + 1) + k3 ]
        // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
        //   is fwd_Z[ m*lengths[0]*2*(lengths[1]/2 + 1)
        //             + k1*2*(lengths[1]/2 + 1) + k2 ]
        // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
        //   is fwd_Z[ m*2*(lengths[0]/2 + 1) + k1 ]
        // (0 <= kj < lengths[j-1])
        return dft_ns::compute_forward(desc, device_accessible_usm_data);
        // Upon completion of the above, the output data entry of multi-index
        // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
        //   is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
        //             + k1*lengths[1]*(lengths[2]/2 + 1)
        //             + k2*(lengths[2]/2 + 1) + k3 ]
        // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
        //   is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
        //             + k1*(lengths[1]/2 + 1) + k2 ]
        // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
        //   is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
        // (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
        //  0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
    }
    else {
        // Initialize data such that input data entry of multi-index
        // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
        //   is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
        //             + k1*lengths[1]*(lengths[2]/2 + 1)
        //             + k2*(lengths[2]/2 + 1) + k3 ]
        // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
        //   is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
        //             + k1*(lengths[1]/2 + 1) + k2 ]
        // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
        //   is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
        // (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
        //  0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
        return dft_ns::compute_backward(desc, device_accessible_usm_data);
        // Upon completion of the above, the output data entry of multi-index
        // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
        //   is fwd_Z[ m*lengths[0]*lengths[1]*2*(lengths[2]/2 + 1)
        //             + k1*lengths[1]*2*(lengths[2]/2 + 1)
        //             + k2*2*(lengths[2]/2 + 1) + k3 ]
        // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
        //   is fwd_Z[ m*lengths[0]*2*(lengths[1]/2 + 1)
        //             + k1*2*(lengths[1]/2 + 1) + k2 ]
        // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
        //   is fwd_Z[ m*2*(lengths[0]/2 + 1) + k1 ]
        // (0 <= kj < lengths[j-1])
    }
}
template<typename T_in, typename T_out>
sycl::event compute_outofplace_real_dft(sycl::queue& q,
                                        const std::vector<int64_t>& lengths,
                                        const int64_t& batch_size,
                                        const dft_compute_direction& direction,
                                        T_in* device_accessible_usm_data_in,
                                        T_out* device_accessible_usm_data_out) {
    constexpr bool is_single_precision =
        std::is_same_v<T_in, float> || std::is_same_v<T_in, std::complex<float>>;
    constexpr auto prec = (is_single_precision) ?
        dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
    auto desc = dft_ns::descriptor<prec, dft_ns::domain::REAL>(lengths);
    ready_descriptor(desc, q, DFTI_NOT_INPLACE, batch_size);
    if (direction == dft_compute_direction::FWD_DFT) {
        // Initialize data such that input data entry of multi-index
        // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
        //   is fwd_Z[ m*lengths[0]*lengths[1]*lengths[2]
        //             + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
        // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
        //   is fwd_Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
        // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
        //   is fwd_Z[ m*lengths[0] + k1 ]
        // wherein
        //   T* fwd_Z = static_cast<T*>(device_accessible_usm_data_in);
        // (0 <= kj < lengths[j-1])
        return dft_ns::compute_forward(desc, device_accessible_usm_data_in,
                                       device_accessible_usm_data_out);
        // Upon completion of the above, the output data entry of multi-index
        // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
        //   is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
        //             + k1*lengths[1]*(lengths[2]/2 + 1)
        //             + k2*(lengths[2]/2 + 1) + k3 ]
        // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
        //   is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
        //             + k1*(lengths[1]/2 + 1) + k2 ]
        // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
        //   is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
        // (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
        //  0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
        // wherein
        //   std::complex<T>* bwd_Z = static_cast<std::complex<T>*>(device_accessible_usm_data_out);
    }
    else {
        // Initialize data such that input data entry of multi-index
        // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
        //   is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
        //             + k1*lengths[1]*(lengths[2]/2 + 1)
        //             + k2*(lengths[2]/2 + 1) + k3 ]
        // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
        //   is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
        //             + k1*(lengths[1]/2 + 1) + k2 ]
        // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
        //   is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
        // (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
        //  0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
        // wherein
        //   std::complex<T>* bwd_Z = static_cast<std::complex<T>*>(device_accessible_usm_data_in);
        return dft_ns::compute_backward(desc, device_accessible_usm_data_in,
                                        device_accessible_usm_data_out);
        // Upon completion of the above, the output data entry of multi-index
        // - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
        //   is fwd_Z[ m*lengths[0]*lengths[1]*lengths[2]
        //             + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
        // - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
        //   is fwd_Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
        // - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
        //   is fwd_Z[ m*lengths[0] + k1 ]
        // (0 <= kj < lengths[j-1])
        // wherein
        //   T* fwd_Z = static_cast<T*>(device_accessible_usm_data_out);
    }
}
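As a final illustration, the helper routines above might be invoked as sketched below. This driver is not part of the documented example: the queue, the 64-by-64 lengths, the batch size, and the skipped device-side initialization are assumptions made purely for illustration.
int main() {
    sycl::queue q; // default device; assumed to support device USM allocations
    const std::vector<int64_t> lengths{64, 64}; // 2D DFTs of size 64 x 64
    const int64_t batch_size = 4;               // four transforms computed per call
    const int64_t num_elements = batch_size * lengths[0] * lengths[1];
    auto* data = sycl::malloc_device<std::complex<float>>(num_elements, q);
    // ... initialize data on the device as described in the comments above ...
    sycl::event done = compute_inplace_cmplx_dft(q, lengths, batch_size,
                                                 dft_compute_direction::FWD_DFT, data);
    done.wait(); // forward-domain data has been overwritten with backward-domain results
    sycl::free(data, q);
    return 0;
}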