disable_loop_pipelining |
Directs the Intel® oneAPI DPC++/C++ Compiler to disable pipelining of a loop. |
[[intel::disable_loop_pipelining]]
for (int i = 1; i < N; i++) {
int j = a[i-1];
// Memory dependency induces a high-latency loop feedback path
a[i] = foo(j);
}
|
initiation_interval |
Forces a loop to have a loop initiation interval (II) of a specified value. |
// ii set to 5
[[intel::initiation_interval(5)]]
for (int i = 0; i < N; ++i){
}
|
ivdep |
Ignores memory dependencies between iterations of this loop. |
// ivdep loop
[[intel::ivdep]] for (…) {}
//ivdep safelen
[[intel::ivdep(safelen)]] for (;;) {}
// ivdep accessor
[[intel::ivdep(accessorA)]] for (;;) {}
// ivdep accessor safelen
[[intel::ivdep(accessorA, safelen)]]
for (;;){}
|
loop_coalesce |
Coalesces nested loops into a single loop without affecting the loop functionality. |
[[intel::loop_coalesce(2)]]
for (int i = 0; i < N; i++)
for (int j = 0; j < M; j++)
sum[i][j] += i+j;
|
max_concurrency |
Limits the number of iterations of a loop that can simultaneously execute at any time. |
//max concurrency set to 1
[[intel::max_concurrency(1)]]
for (int i = 0; i < c; ++i){
}
|
max_interleaving |
Maximizes the throughput and hardware resource occupancy of pipelined inner loops in a loop nest. |
// Loop j is pipelined with ii=1
for (int j = 0; j < M; j++) {
int a[N];
// Loop i is pipelined with ii=2
[[intel::max_interleaving(1)]]
for (int i = 1; i < N; i++) {
a[i] = foo(i);
}
…
}
|
speculated_iterations |
Improves the performance of pipelined loops. |
[[intel::speculated_iterations(1)]]
while (m*m*m < N) {
m += 1;
}
dst[0] = m;
|
unroll |
Unrolls a loop in the kernel code. |
// unroll factor N set to 2
#pragma unroll 2
for(size_t k = 0; k < 4; k++){
mac += data_in[(gid * 4) + k] * coeff[k];
}
|
nofusion |
Prevents the compiler from fusing the annotated loop with any of the adjacent loops. |
for (int x = 0; x < N; x++) {
a1_acc[x] = x;
}
[[intel::nofusion]]
for (int x = 0; x < N; x++) {
a2_acc[x] = x;
}
|
sycl::ext::intel::fpga_loop_fuse<v>(f) |
Fuses loops within the function f up to a depth of v >= 1, where v = 1 by default. |
[=]() [[intel::kernel_args_restrict]] {
sycl::ext::intel::fpga_loop_fuse<v>([&] {
for (int x = 0; x < N; x++) {
for (int y = 0; y < N; y++) {
for (int z = 0; z < N; z++) {
a1_acc[x][y][z] = 0;
}
}
}
for (int x = 0; x < N + 1; x++) {
for (int y = 0; y < N + 1; y++) {
for (int z = 0; z < N + 1; z++) {
a2_acc[x][y][z] = 0;
}
}
}
});
}
|
sycl::ext::intel::fpga_loop_fuse_independent<v>(f) |
Fuses loops within the function f up to a depth v >= 1 while overriding fusion-safety checks. Here, v = 1 by default. |
[=]() { //Kernel
sycl::ext::intel::fpga_loop_fuse_independent([&] {
for(int x = 0; x < N; x++){
a3_acc[x] = x;
}
for(int x = 0; x < N + 1; x++){
a4_acc[x] = x;
}
});
}
|