Visible to Intel only — GUID: GUID-F6F71727-FB36-4E81-A45F-7857EC73C0BD
Visible to Intel only — GUID: GUID-F6F71727-FB36-4E81-A45F-7857EC73C0BD
Efficiency with Structure of Arrays Example
This example demonstrates the efficiency of using a Structure of Arrays (SoA) approach by comparing the assembly generated from a simple SIMD loop using an Array of Structures (AoS) approach with the assembly generated using the SoA approach of SIMD Data Layout Templates (SDLT).
Array of Structures: Non-unit stride access version
Source:
// Array of Structures (AoS) example.
// Each element of `a` stores its r, g and b members next to each other, so a
// vectorized loop over k reaches consecutive `r` (or `g`, or `b`) values with a
// stride of sizeof(RGBTy) rather than sizeof(float): non-unit stride access.
#include <stdio.h>
#include <iostream> // std::cout and std::endl are used in main()

#define N 1024

typedef struct RGBs {
    float r;
    float g;
    float b;
} RGBTy;

// main must return int in standard C++; `void main()` is not portable.
int main()
{
    RGBTy a[N]; // every element is written by the loop below before it is read

    // The float literals (1.5f, ...) keep the arithmetic in single precision and
    // avoid an implicit double-to-float narrowing on every store. The stored
    // values are unchanged: k * 1.5, k * 2.5 and k * 3.5 are exactly
    // representable as float for all 0 <= k < N.
    #pragma omp simd
    for (int k = 0; k < N; ++k) {
        a[k].r = k * 1.5f; // non-unit stride access
        a[k].g = k * 2.5f; // non-unit stride access
        a[k].b = k * 3.5f; // non-unit stride access
    }

    // Print one element so the stores above are observable.
    std::cout << "k =" << 10
              << ", a[k].r =" << a[10].r
              << ", a[k].g =" << a[10].g
              << ", a[k].b =" << a[10].b
              << std::endl;

    return 0;
}
AVX2 assembly generated (69 instructions):
..TOP_OF_LOOP: vcvtdq2ps %ymm7, %ymm1 lea (%rax), %rcx vcvtdq2ps %ymm5, %ymm2 vpaddd %ymm3, %ymm7, %ymm7 vpaddd %ymm3, %ymm5, %ymm5 vmulps %ymm1, %ymm4, %ymm8 vmulps %ymm1, %ymm6, %ymm12 vmulps %ymm2, %ymm6, %ymm14 vmulps %ymm1, %ymm0, %ymm1 vmulps %ymm2, %ymm4, %ymm10 addl $16, %edx vextractf128 $1, %ymm8, %xmm9 vmovss %xmm8, (%rsp,%rcx) vmovss %xmm9, 48(%rsp,%rcx) vextractps $1, %xmm8, 12(%rsp,%rcx) vextractps $2, %xmm8, 24(%rsp,%rcx) vextractps $3, %xmm8, 36(%rsp,%rcx) vmulps %ymm2, %ymm0, %ymm8 vextractps $1, %xmm9, 60(%rsp,%rcx) vextractps $2, %xmm9, 72(%rsp,%rcx) vextractps $3, %xmm9, 84(%rsp,%rcx) vextractf128 $1, %ymm12, %xmm13 vextractf128 $1, %ymm14, %xmm15 vextractf128 $1, %ymm1, %xmm2 vextractf128 $1, %ymm8, %xmm9 vmovss %xmm12, 4(%rsp,%rax) vmovss %xmm13, 52(%rsp,%rax) vextractps $1, %xmm12, 16(%rsp,%rax) vextractps $2, %xmm12, 28(%rsp,%rax) vextractps $3, %xmm12, 40(%rsp,%rax) vextractps $1, %xmm13, 64(%rsp,%rax) vextractps $2, %xmm13, 76(%rsp,%rax) vextractps $3, %xmm13, 88(%rsp,%rax) vmovss %xmm14, 100(%rsp,%rax) vextractps $1, %xmm14, 112(%rsp,%rax) vextractps $2, %xmm14, 124(%rsp,%rax) vextractps $3, %xmm14, 136(%rsp,%rax) vmovss %xmm15, 148(%rsp,%rax) vextractps $1, %xmm15, 160(%rsp,%rax) vextractps $2, %xmm15, 172(%rsp,%rax) vextractps $3, %xmm15, 184(%rsp,%rax) vmovss %xmm1, 8(%rsp,%rax) vextractps $1, %xmm1, 20(%rsp,%rax) vextractps $2, %xmm1, 32(%rsp,%rax) vextractps $3, %xmm1, 44(%rsp,%rax) vmovss %xmm2, 56(%rsp,%rax) vextractps $1, %xmm2, 68(%rsp,%rax) vextractps $2, %xmm2, 80(%rsp,%rax) vextractps $3, %xmm2, 92(%rsp,%rax) vmovss %xmm8, 104(%rsp,%rax) vextractps $1, %xmm8, 116(%rsp,%rax) vextractps $2, %xmm8, 128(%rsp,%rax) vextractps $3, %xmm8, 140(%rsp,%rax) vmovss %xmm9, 152(%rsp,%rax) vextractps $1, %xmm9, 164(%rsp,%rax) vextractps $2, %xmm9, 176(%rsp,%rax) vextractps $3, %xmm9, 188(%rsp,%rax) addq $192, %rax vextractf128 $1, %ymm10, %xmm11 vmovss %xmm10, 96(%rsp,%rcx) vmovss %xmm11, 144(%rsp,%rcx) vextractps $1, %xmm10, 108(%rsp,%rcx) 
vextractps $2, %xmm10, 120(%rsp,%rcx) vextractps $3, %xmm10, 132(%rsp,%rcx) vextractps $1, %xmm11, 156(%rsp,%rcx) vextractps $2, %xmm11, 168(%rsp,%rcx) vextractps $3, %xmm11, 180(%rsp,%rcx) cmpl $1024, %edx jb ..TOP_OF_LOOP
Structure of Arrays: Using SDLT for unit stride access
To introduce the use of SDLT, the code below will:
declare a primitive
use an soa1d_container instead of an array
use an accessor inside a SIMD loop to generate efficient code
use a proxy object’s data member interface to access individual data members of an element inside the container
Source:
#include <stdio.h> #include <sdlt/sdlt.h> #define N 1024 typedef struct RGBs { float r; float g; float b; } RGBTy; SDLT_PRIMITIVE(RGBTy, r, g, b) void main() { // Use SDLT to get SOA data layout sdlt::soa1d_container<RGBTy> aContainer(N); auto a = aContainer.access(); // use SDLT Data Member Interface to access struct members r, g, and b. // achieve unit-stride access after vectorization #pragma omp simd for (int k = 0; k<N; k++) { a[k].r() = k*1.5; a[k].g() = k*2.5; a[k].b() = k*3.5; } std::cout << "k =" << 10 << ", a[k].r =" << a[10].r() << ", a[k].g =" << a[10].g() << ", a[k].b =" << a[10].b() << std::endl; }
AVX2 assembly generated (19 instructions):
..TOP_OF_LOOP: vpaddd %ymm4, %ymm3, %ymm12 vcvtdq2ps %ymm3, %ymm7 vcvtdq2ps %ymm12, %ymm10 vmulps %ymm7, %ymm2, %ymm5 vmulps %ymm7, %ymm1, %ymm6 vmulps %ymm7, %ymm0, %ymm8 vmulps %ymm10, %ymm2, %ymm3 vmulps %ymm10, %ymm1, %ymm9 vmulps %ymm10, %ymm0, %ymm11 vmovups %ymm5, (%r13,%rax,4) vmovups %ymm6, (%r15,%rax,4) vmovups %ymm8, (%rbx,%rax,4) vmovups %ymm3, 32(%r13,%rax,4) vmovups %ymm9, 32(%r15,%rax,4) vmovups %ymm11, 32(%rbx,%rax,4) vpaddd %ymm4, %ymm12, %ymm3 addq $16, %rax cmpq $1024, %rax jb ..TOP_OF_LOOP
In both versions the compiler appears to have unrolled the vectorized loop twice. Examining the assembly generated for the AVX2 instruction set, we can see a measurable reduction in the number of instructions (19 vs. 69) when we are able to perform unit-stride access using SDLT. In addition, the soa1d_container aligns its data allocation at runtime, so the SoA version also gains any of the architectural advantages that come with using aligned instead of unaligned SIMD stores.