• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include <string.h>
31 #include <type_traits>
32 #include <algorithm>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
36 #include "core/api.h"
37 
#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
// 32-bit builds: emulate the 64-bit lane extract/insert intrinsics by
// bouncing the vector through an aligned scratch buffer in memory.

/// @brief Return the 64-bit lane `ndx` (0 or non-zero -> 1) of @p a.
INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    OSALIGNLINE(int64_t) lanes[2];
    _mm_store_si128((__m128i*)lanes, a);
    return (ndx == 0) ? lanes[0] : lanes[1];
}

/// @brief Return @p a with its 64-bit lane `ndx` (0 or non-zero -> 1)
///        replaced by @p b.
INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    OSALIGNLINE(int64_t) lanes[2];
    _mm_store_si128((__m128i*)lanes, a);
    lanes[(ndx == 0) ? 0 : 1] = b;
    return _mm_load_si128((const __m128i*)lanes);
}
#endif
77 
//////////////////////////////////////////////////////////////////////////
/// simdBBox
/// @brief One bounding box per SIMD lane, stored SOA-style: each member
///        holds the same coordinate for all lanes.
//////////////////////////////////////////////////////////////////////////
struct simdBBox
{
    simdscalari ymin;   // per-lane minimum y
    simdscalari ymax;   // per-lane maximum y
    simdscalari xmin;   // per-lane minimum x
    simdscalari xmax;   // per-lane maximum x
};
85 
86 INLINE
vTranspose(__m128 & row0,__m128 & row1,__m128 & row2,__m128 & row3)87 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
88 {
89     __m128i row0i = _mm_castps_si128(row0);
90     __m128i row1i = _mm_castps_si128(row1);
91     __m128i row2i = _mm_castps_si128(row2);
92     __m128i row3i = _mm_castps_si128(row3);
93 
94     __m128i vTemp = row2i;
95     row2i = _mm_unpacklo_epi32(row2i, row3i);
96     vTemp = _mm_unpackhi_epi32(vTemp, row3i);
97 
98     row3i = row0i;
99     row0i = _mm_unpacklo_epi32(row0i, row1i);
100     row3i = _mm_unpackhi_epi32(row3i, row1i);
101 
102     row1i = row0i;
103     row0i = _mm_unpacklo_epi64(row0i, row2i);
104     row1i = _mm_unpackhi_epi64(row1i, row2i);
105 
106     row2i = row3i;
107     row2i = _mm_unpacklo_epi64(row2i, vTemp);
108     row3i = _mm_unpackhi_epi64(row3i, vTemp);
109 
110     row0 = _mm_castsi128_ps(row0i);
111     row1 = _mm_castsi128_ps(row1i);
112     row2 = _mm_castsi128_ps(row2i);
113     row3 = _mm_castsi128_ps(row3i);
114 }
115 
116 INLINE
vTranspose(__m128i & row0,__m128i & row1,__m128i & row2,__m128i & row3)117 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
118 {
119     __m128i vTemp = row2;
120     row2 = _mm_unpacklo_epi32(row2, row3);
121     vTemp = _mm_unpackhi_epi32(vTemp, row3);
122 
123     row3 = row0;
124     row0 = _mm_unpacklo_epi32(row0, row1);
125     row3 = _mm_unpackhi_epi32(row3, row1);
126 
127     row1 = row0;
128     row0 = _mm_unpacklo_epi64(row0, row2);
129     row1 = _mm_unpackhi_epi64(row1, row2);
130 
131     row2 = row3;
132     row2 = _mm_unpacklo_epi64(row2, vTemp);
133     row3 = _mm_unpackhi_epi64(row3, vTemp);
134 }
135 
// Composite GCC version number, e.g. 4.9.0 -> 40900.  Only meaningful when
// __GNUC__ is defined; undefined macros evaluate to 0 inside #if, so the
// guarded use below is still well-formed on other compilers.
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// clang and gcc < 4.9 lack the _mm*_undefined_* intrinsics; map them to the
// setzero forms (same contract for callers that treat contents as don't-care).
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
147 
148 #if KNOB_SIMD_WIDTH == 8
149 INLINE
vTranspose3x8(__m128 (& vDst)[8],const __m256 & vSrc0,const __m256 & vSrc1,const __m256 & vSrc2)150 void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
151 {
152     __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
153     __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0w0y1w1 y4w4y5w5
154     __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
155     __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
156 
157     r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
158     r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6yw77
159     __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
160     __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
161 
162     vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
163     vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
164     vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
165     vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
166 
167     vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
168     vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
169     vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
170     vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
171 }
172 
173 INLINE
vTranspose4x8(__m128 (& vDst)[8],const __m256 & vSrc0,const __m256 & vSrc1,const __m256 & vSrc2,const __m256 & vSrc3)174 void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
175 {
176     __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
177     __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                    //y0w0y1w1 y4w4y5w5
178     __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
179     __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
180 
181     r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
182     r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3)                    ;        //y2w2y3w3 y6w6yw77
183     __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
184     __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
185 
186     vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
187     vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
188     vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
189     vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
190 
191     vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
192     vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
193     vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
194     vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
195 }
196 
197 #if ENABLE_AVX512_SIMD16
198 INLINE
vTranspose4x16(simd16scalar (& dst)[4],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2,const simd16scalar & src3)199 void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
200 {
201     const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
202 
203     simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
204     simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
205     simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
206     simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
207 
208     simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
209     simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
210     simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
211     simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
212 
213     dst[0] = _simd16_unpacklo_ps(rblo, galo);
214     dst[1] = _simd16_unpackhi_ps(rblo, galo);
215     dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
216     dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
217 }
218 
219 #endif
220 INLINE
vTranspose8x8(__m256 (& vDst)[8],const __m256 & vMask0,const __m256 & vMask1,const __m256 & vMask2,const __m256 & vMask3,const __m256 & vMask4,const __m256 & vMask5,const __m256 & vMask6,const __m256 & vMask7)221 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
222 {
223     __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
224     __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
225     __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
226     __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
227     __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
228     __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
229     __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
230     __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
231     __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
232     __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
233     __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
234     __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
235     __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
236     __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
237     __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
238     __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
239     vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
240     vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
241     vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
242     vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
243     vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
244     vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
245     vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
246     vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
247 }
248 
249 INLINE
vTranspose8x8(__m256 (& vDst)[8],const __m256i & vMask0,const __m256i & vMask1,const __m256i & vMask2,const __m256i & vMask3,const __m256i & vMask4,const __m256i & vMask5,const __m256i & vMask6,const __m256i & vMask7)250 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
251 {
252     vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
253         _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
254 }
255 #endif
256 
257 //////////////////////////////////////////////////////////////////////////
258 /// TranposeSingleComponent
259 //////////////////////////////////////////////////////////////////////////
260 template<uint32_t bpp>
261 struct TransposeSingleComponent
262 {
263     //////////////////////////////////////////////////////////////////////////
264     /// @brief Pass-thru for single component.
265     /// @param pSrc - source data in SOA form
266     /// @param pDst - output data in AOS form
TransposeTransposeSingleComponent267     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
268     {
269         memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
270     }
271 #if ENABLE_AVX512_SIMD16
272 
Transpose_16TransposeSingleComponent273     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
274     {
275         memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
276     }
277 #endif
278 };
279 
280 //////////////////////////////////////////////////////////////////////////
281 /// Transpose8_8_8_8
282 //////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    /// @param pSrc - source data in SOA form (r bytes, then g, then b, then a)
    /// @param pDst - output data in AOS form (one rgba byte quad per pixel)
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1 has no 256-bit byte shuffle: split into 128-bit halves and
        // interleave with successively wider unpacks (64 -> 8 -> 16 bit).
        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2: byte-shuffle each 128-bit lane into place (0x80 byte indices
        // produce zeros), then OR with the lane-swapped copy shuffled into
        // the complementary byte positions.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        // Load each 16-wide byte channel.
        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
        __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa

        // Zero-extend each byte to a 32-bit lane...
        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);

        // ...shift g/b/a to their byte positions within the pixel...
        simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);

        // ...and OR the channels together: each 32-bit lane is one rgba pixel.
        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
    }
#endif
};
342 
343 //////////////////////////////////////////////////////////////////////////
344 /// Transpose8_8_8
345 //////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Explicitly deleted: no SIMD transpose is provided for this
    ///       layout, so any instantiation is a compile-time error.
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
358 
359 //////////////////////////////////////////////////////////////////////////
360 /// Transpose8_8
361 //////////////////////////////////////////////////////////////////////////
362 struct Transpose8_8
363 {
364     //////////////////////////////////////////////////////////////////////////
365     /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
366     /// @param pSrc - source data in SOA form
367     /// @param pDst - output data in AOS form
TransposeTranspose8_8368     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
369     {
370 #if KNOB_SIMD_WIDTH == 8
371         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
372 
373         __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
374         __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
375         rg = _mm_unpacklo_epi8(rg, g);
376         _mm_store_si128((__m128i*)pDst, rg);
377 #else
378 #error Unsupported vector width
379 #endif
380     }
381 #if ENABLE_AVX512_SIMD16
382 
Transpose_16Transpose8_8383     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
384     {
385         __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
386         __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
387 
388         simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
389         simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
390 
391         simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
392 
393         simdscalari dst = _simd_or_si(cvt0, shl1);
394 
395         _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
396     }
397 #endif
398 };
399 
400 //////////////////////////////////////////////////////////////////////////
401 /// Transpose32_32_32_32
402 //////////////////////////////////////////////////////////////////////////
403 struct Transpose32_32_32_32
404 {
405     //////////////////////////////////////////////////////////////////////////
406     /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
407     /// @param pSrc - source data in SOA form
408     /// @param pDst - output data in AOS form
TransposeTranspose32_32_32_32409     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
410     {
411 #if KNOB_SIMD_WIDTH == 8
412         simdscalar src0 = _simd_load_ps((const float*)pSrc);
413         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
414         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
415         simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
416 
417         __m128 vDst[8];
418         vTranspose4x8(vDst, src0, src1, src2, src3);
419         _mm_store_ps((float*)pDst, vDst[0]);
420         _mm_store_ps((float*)pDst+4, vDst[1]);
421         _mm_store_ps((float*)pDst+8, vDst[2]);
422         _mm_store_ps((float*)pDst+12, vDst[3]);
423         _mm_store_ps((float*)pDst+16, vDst[4]);
424         _mm_store_ps((float*)pDst+20, vDst[5]);
425         _mm_store_ps((float*)pDst+24, vDst[6]);
426         _mm_store_ps((float*)pDst+28, vDst[7]);
427 #else
428 #error Unsupported vector width
429 #endif
430     }
431 #if ENABLE_AVX512_SIMD16
432 
Transpose_16Transpose32_32_32_32433     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
434     {
435         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
436         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
437         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
438         simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
439 
440         simd16scalar dst[4];
441 
442         vTranspose4x16(dst, src0, src1, src2, src3);
443 
444         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
445         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
446         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
447         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
448     }
449 #endif
450 };
451 
452 //////////////////////////////////////////////////////////////////////////
453 /// Transpose32_32_32
454 //////////////////////////////////////////////////////////////////////////
455 struct Transpose32_32_32
456 {
457     //////////////////////////////////////////////////////////////////////////
458     /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
459     /// @param pSrc - source data in SOA form
460     /// @param pDst - output data in AOS form
TransposeTranspose32_32_32461     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
462     {
463 #if KNOB_SIMD_WIDTH == 8
464         simdscalar src0 = _simd_load_ps((const float*)pSrc);
465         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
466         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
467 
468         __m128 vDst[8];
469         vTranspose3x8(vDst, src0, src1, src2);
470         _mm_store_ps((float*)pDst, vDst[0]);
471         _mm_store_ps((float*)pDst + 4, vDst[1]);
472         _mm_store_ps((float*)pDst + 8, vDst[2]);
473         _mm_store_ps((float*)pDst + 12, vDst[3]);
474         _mm_store_ps((float*)pDst + 16, vDst[4]);
475         _mm_store_ps((float*)pDst + 20, vDst[5]);
476         _mm_store_ps((float*)pDst + 24, vDst[6]);
477         _mm_store_ps((float*)pDst + 28, vDst[7]);
478 #else
479 #error Unsupported vector width
480 #endif
481     }
482 #if ENABLE_AVX512_SIMD16
483 
Transpose_16Transpose32_32_32484     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
485     {
486         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
487         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
488         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
489         simd16scalar src3 = _simd16_setzero_ps();
490 
491         simd16scalar dst[4];
492 
493         vTranspose4x16(dst, src0, src1, src2, src3);
494 
495         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
496         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
497         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
498         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
499     }
500 #endif
501 };
502 
503 //////////////////////////////////////////////////////////////////////////
504 /// Transpose32_32
505 //////////////////////////////////////////////////////////////////////////
506 struct Transpose32_32
507 {
508     //////////////////////////////////////////////////////////////////////////
509     /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
510     /// @param pSrc - source data in SOA form
511     /// @param pDst - output data in AOS form
TransposeTranspose32_32512     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
513     {
514 #if KNOB_SIMD_WIDTH == 8
515         const float* pfSrc = (const float*)pSrc;
516         __m128 src_r0 = _mm_load_ps(pfSrc + 0);
517         __m128 src_r1 = _mm_load_ps(pfSrc + 4);
518         __m128 src_g0 = _mm_load_ps(pfSrc + 8);
519         __m128 src_g1 = _mm_load_ps(pfSrc + 12);
520 
521         __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
522         __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
523         __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
524         __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
525 
526         float* pfDst = (float*)pDst;
527         _mm_store_ps(pfDst + 0, dst0);
528         _mm_store_ps(pfDst + 4, dst1);
529         _mm_store_ps(pfDst + 8, dst2);
530         _mm_store_ps(pfDst + 12, dst3);
531 #else
532 #error Unsupported vector width
533 #endif
534     }
535 #if ENABLE_AVX512_SIMD16
536 
Transpose_16Transpose32_32537     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
538     {
539         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
540         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg
541 
542         simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
543         simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
544 
545         simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
546         simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
547 
548         simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
549         simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
550 
551         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
552         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
553     }
554 #endif
555 };
556 
557 //////////////////////////////////////////////////////////////////////////
558 /// Transpose16_16_16_16
559 //////////////////////////////////////////////////////////////////////////
560 struct Transpose16_16_16_16
561 {
562     //////////////////////////////////////////////////////////////////////////
563     /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
564     /// @param pSrc - source data in SOA form
565     /// @param pDst - output data in AOS form
TransposeTranspose16_16_16_16566     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
567     {
568 #if KNOB_SIMD_WIDTH == 8
569         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
570         simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
571 
572         __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
573         __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
574         __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
575         __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
576 
577         __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
578         __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
579         __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
580         __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
581 
582         __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
583         __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
584         __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
585         __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
586 
587         _mm_store_si128(((__m128i*)pDst) + 0, dst0);
588         _mm_store_si128(((__m128i*)pDst) + 1, dst1);
589         _mm_store_si128(((__m128i*)pDst) + 2, dst2);
590         _mm_store_si128(((__m128i*)pDst) + 3, dst3);
591 #else
592 #error Unsupported vector width
593 #endif
594     }
595 #if ENABLE_AVX512_SIMD16
596 
Transpose_16Transpose16_16_16_16597     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
598     {
599         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
600         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
601         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
602         simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa
603 
604         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
605         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
606         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
607         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
608 
609         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
610         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
611         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
612         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
613 
614         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
615         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
616         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
617         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
618 
619         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
620         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
621         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
622         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
623     }
624 #endif
625 };
626 
627 //////////////////////////////////////////////////////////////////////////
628 /// Transpose16_16_16
629 //////////////////////////////////////////////////////////////////////////
630 struct Transpose16_16_16
631 {
632     //////////////////////////////////////////////////////////////////////////
633     /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
634     /// @param pSrc - source data in SOA form
635     /// @param pDst - output data in AOS form
TransposeTranspose16_16_16636     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
637     {
638 #if KNOB_SIMD_WIDTH == 8
639         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
640 
641         __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
642         __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
643         __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
644         __m128i src_a = _mm_undefined_si128();
645 
646         __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
647         __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
648         __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
649         __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
650 
651         __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
652         __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
653         __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
654         __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
655 
656         _mm_store_si128(((__m128i*)pDst) + 0, dst0);
657         _mm_store_si128(((__m128i*)pDst) + 1, dst1);
658         _mm_store_si128(((__m128i*)pDst) + 2, dst2);
659         _mm_store_si128(((__m128i*)pDst) + 3, dst3);
660 #else
661 #error Unsupported vector width
662 #endif
663     }
664 #if ENABLE_AVX512_SIMD16
665 
Transpose_16Transpose16_16_16666     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
667     {
668         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
669         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
670         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
671         simdscalari src3 = _simd_setzero_si();                                                      // aaaaaaaaaaaaaaaa
672 
673         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
674         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
675         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
676         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
677 
678         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
679         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
680         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
681         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
682 
683         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
684         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
685         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
686         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
687 
688         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
689         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
690         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
691         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
692     }
693 #endif
694 };
695 
696 //////////////////////////////////////////////////////////////////////////
697 /// Transpose16_16
698 //////////////////////////////////////////////////////////////////////////
699 struct Transpose16_16
700 {
701     //////////////////////////////////////////////////////////////////////////
702     /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
703     /// @param pSrc - source data in SOA form
704     /// @param pDst - output data in AOS form
TransposeTranspose16_16705     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
706     {
707 #if KNOB_SIMD_WIDTH == 8
708         simdscalar src = _simd_load_ps((const float*)pSrc);
709 
710         __m128 comp0 = _mm256_castps256_ps128(src);
711         __m128 comp1 = _mm256_extractf128_ps(src, 1);
712 
713         __m128i comp0i = _mm_castps_si128(comp0);
714         __m128i comp1i = _mm_castps_si128(comp1);
715 
716         __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
717         __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
718 
719         _mm_store_si128((__m128i*)pDst, resLo);
720         _mm_store_si128((__m128i*)pDst + 1, resHi);
721 #else
722 #error Unsupported vector width
723 #endif
724     }
725 #if ENABLE_AVX512_SIMD16
726 
Transpose_16Transpose16_16727     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
728     {
729         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
730         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
731 
732         simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
733         simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
734 
735         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
736         simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
737 
738         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
739         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
740     }
741 #endif
742 };
743 
744 //////////////////////////////////////////////////////////////////////////
745 /// Transpose24_8
746 //////////////////////////////////////////////////////////////////////////
747 struct Transpose24_8
748 {
749     //////////////////////////////////////////////////////////////////////////
750     /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
751     /// @param pSrc - source data in SOA form
752     /// @param pDst - output data in AOS form
753     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
754 #if ENABLE_AVX512_SIMD16
755 
756     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
757 #endif
758 };
759 
760 //////////////////////////////////////////////////////////////////////////
761 /// Transpose32_8_24
762 //////////////////////////////////////////////////////////////////////////
763 struct Transpose32_8_24
764 {
765     //////////////////////////////////////////////////////////////////////////
766     /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
767     /// @param pSrc - source data in SOA form
768     /// @param pDst - output data in AOS form
769     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
770 #if ENABLE_AVX512_SIMD16
771 
772     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
773 #endif
774 };
775 
776 //////////////////////////////////////////////////////////////////////////
777 /// Transpose4_4_4_4
778 //////////////////////////////////////////////////////////////////////////
779 struct Transpose4_4_4_4
780 {
781     //////////////////////////////////////////////////////////////////////////
782     /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
783     /// @param pSrc - source data in SOA form
784     /// @param pDst - output data in AOS form
785     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
786 #if ENABLE_AVX512_SIMD16
787 
788     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
789 #endif
790 };
791 
792 //////////////////////////////////////////////////////////////////////////
793 /// Transpose5_6_5
794 //////////////////////////////////////////////////////////////////////////
795 struct Transpose5_6_5
796 {
797     //////////////////////////////////////////////////////////////////////////
798     /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
799     /// @param pSrc - source data in SOA form
800     /// @param pDst - output data in AOS form
801     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
802 #if ENABLE_AVX512_SIMD16
803 
804     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
805 #endif
806 };
807 
808 //////////////////////////////////////////////////////////////////////////
809 /// Transpose9_9_9_5
810 //////////////////////////////////////////////////////////////////////////
811 struct Transpose9_9_9_5
812 {
813     //////////////////////////////////////////////////////////////////////////
814     /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
815     /// @param pSrc - source data in SOA form
816     /// @param pDst - output data in AOS form
817     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
818 #if ENABLE_AVX512_SIMD16
819 
820     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
821 #endif
822 };
823 
824 //////////////////////////////////////////////////////////////////////////
825 /// Transpose5_5_5_1
826 //////////////////////////////////////////////////////////////////////////
827 struct Transpose5_5_5_1
828 {
829     //////////////////////////////////////////////////////////////////////////
830     /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
831     /// @param pSrc - source data in SOA form
832     /// @param pDst - output data in AOS form
833     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
834 #if ENABLE_AVX512_SIMD16
835 
836     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
837 #endif
838 };
839 
840 //////////////////////////////////////////////////////////////////////////
841 /// Transpose1_5_5_5
842 //////////////////////////////////////////////////////////////////////////
843 struct Transpose1_5_5_5
844 {
845     //////////////////////////////////////////////////////////////////////////
846     /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
847     /// @param pSrc - source data in SOA form
848     /// @param pDst - output data in AOS form
849     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
850 };
851 
852 //////////////////////////////////////////////////////////////////////////
853 /// Transpose10_10_10_2
854 //////////////////////////////////////////////////////////////////////////
855 struct Transpose10_10_10_2
856 {
857     //////////////////////////////////////////////////////////////////////////
858     /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
859     /// @param pSrc - source data in SOA form
860     /// @param pDst - output data in AOS form
861     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
862 #if ENABLE_AVX512_SIMD16
863 
864     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
865 #endif
866 };
867 
868 //////////////////////////////////////////////////////////////////////////
869 /// Transpose11_11_10
870 //////////////////////////////////////////////////////////////////////////
871 struct Transpose11_11_10
872 {
873     //////////////////////////////////////////////////////////////////////////
874     /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
875     /// @param pSrc - source data in SOA form
876     /// @param pDst - output data in AOS form
877     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
878 #if ENABLE_AVX512_SIMD16
879 
880     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
881 #endif
882 };
883 
884 //////////////////////////////////////////////////////////////////////////
885 /// Transpose64
886 //////////////////////////////////////////////////////////////////////////
887 struct Transpose64
888 {
889     //////////////////////////////////////////////////////////////////////////
890     /// @brief Performs an SOA to AOS conversion
891     /// @param pSrc - source data in SOA form
892     /// @param pDst - output data in AOS form
893     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
894 #if ENABLE_AVX512_SIMD16
895 
896     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
897 #endif
898 };
899 
900 //////////////////////////////////////////////////////////////////////////
901 /// Transpose64_64
902 //////////////////////////////////////////////////////////////////////////
903 struct Transpose64_64
904 {
905     //////////////////////////////////////////////////////////////////////////
906     /// @brief Performs an SOA to AOS conversion
907     /// @param pSrc - source data in SOA form
908     /// @param pDst - output data in AOS form
909     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
910 #if ENABLE_AVX512_SIMD16
911 
912     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
913 #endif
914 };
915 
916 //////////////////////////////////////////////////////////////////////////
917 /// Transpose64_64_64
918 //////////////////////////////////////////////////////////////////////////
919 struct Transpose64_64_64
920 {
921     //////////////////////////////////////////////////////////////////////////
922     /// @brief Performs an SOA to AOS conversion
923     /// @param pSrc - source data in SOA form
924     /// @param pDst - output data in AOS form
925     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
926 #if ENABLE_AVX512_SIMD16
927 
928     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
929 #endif
930 };
931 
932 //////////////////////////////////////////////////////////////////////////
933 /// Transpose64_64_64_64
934 //////////////////////////////////////////////////////////////////////////
935 struct Transpose64_64_64_64
936 {
937     //////////////////////////////////////////////////////////////////////////
938     /// @brief Performs an SOA to AOS conversion
939     /// @param pSrc - source data in SOA form
940     /// @param pDst - output data in AOS form
941     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
942 #if ENABLE_AVX512_SIMD16
943 
944     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
945 #endif
946 };
947 
// helper function to unroll loops
// UnrollerL<Begin, End, Step>::step(f) expands at compile time to
// f(Begin); f(Begin + Step); ... for every index in [Begin, End).
// Note: (End - Begin) must be an exact multiple of Step, otherwise the
// recursion never hits the terminator specialization below.
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};

// Terminator: recursion stops when Begin reaches End (empty range).
template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
964 
965 // helper function to unroll loops, with mask to skip specific iterations
966 template<int Begin, int End, int Step = 1, int Mask = 0x7f>
967 struct UnrollerLMask {
968     template<typename Lambda>
969     INLINE static void step(Lambda& func) {
970         if(Mask & (1 << Begin))
971         {
972             func(Begin);
973         }
974         UnrollerL<Begin + Step, End, Step>::step(func);
975     }
976 };
977 
978 template<int End, int Step, int Mask>
979 struct UnrollerLMask<End, End, Step, Mask> {
980     template<typename Lambda>
981     static void step(Lambda& func) {
982     }
983 };
984 
// general CRC compute
// Computes a CRC over an arbitrary byte buffer using the SSE4.2 hardware
// crc32 instruction (CRC-32C / Castagnoli polynomial), seeded with 'crc'.
// Processes the buffer in native-word chunks, then the trailing bytes.
// NOTE(review): pData is dereferenced as uint64_t*/uint32_t*; presumably
// callers pass suitably aligned buffers — confirm if alignment matters on
// the targets in use.
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    // 64-bit targets: consume 8 bytes per crc32 step.
    uint32_t sizeInQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    uint64_t* pDataWords = (uint64_t*)pData;
    for (uint32_t i = 0; i < sizeInQwords; ++i)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    // 32-bit targets: consume 4 bytes per crc32 step.
    uint32_t sizeInDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    uint32_t* pDataWords = (uint32_t*)pData;
    for (uint32_t i = 0; i < sizeInDwords; ++i)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    // Fold in any bytes left over after the word-sized loop.
    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}
1015 
1016 //////////////////////////////////////////////////////////////////////////
1017 /// Add byte offset to any-type pointer
1018 //////////////////////////////////////////////////////////////////////////
1019 template <typename T>
1020 INLINE
1021 static T* PtrAdd(T* p, intptr_t offset)
1022 {
1023     intptr_t intp = reinterpret_cast<intptr_t>(p);
1024     return reinterpret_cast<T*>(intp + offset);
1025 }
1026 
1027 //////////////////////////////////////////////////////////////////////////
1028 /// Is a power-of-2?
1029 //////////////////////////////////////////////////////////////////////////
1030 template <typename T>
1031 INLINE
1032 static bool IsPow2(T value)
1033 {
1034     return value == (value & (0 - value));
1035 }
1036 
1037 //////////////////////////////////////////////////////////////////////////
1038 /// Align down to specified alignment
1039 /// Note: IsPow2(alignment) MUST be true
1040 //////////////////////////////////////////////////////////////////////////
1041 template <typename T1, typename T2>
1042 INLINE
1043 static T1 AlignDownPow2(T1 value, T2 alignment)
1044 {
1045     SWR_ASSERT(IsPow2(alignment));
1046     return value & ~T1(alignment - 1);
1047 }
1048 
1049 //////////////////////////////////////////////////////////////////////////
1050 /// Align up to specified alignment
1051 /// Note: IsPow2(alignment) MUST be true
1052 //////////////////////////////////////////////////////////////////////////
1053 template <typename T1, typename T2>
1054 INLINE
1055 static T1 AlignUpPow2(T1 value, T2 alignment)
1056 {
1057     return AlignDownPow2(value + T1(alignment - 1), alignment);
1058 }
1059 
1060 //////////////////////////////////////////////////////////////////////////
1061 /// Align up ptr to specified alignment
1062 /// Note: IsPow2(alignment) MUST be true
1063 //////////////////////////////////////////////////////////////////////////
1064 template <typename T1, typename T2>
1065 INLINE
1066 static T1* AlignUpPow2(T1* value, T2 alignment)
1067 {
1068     return reinterpret_cast<T1*>(
1069         AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
1070 }
1071 
1072 //////////////////////////////////////////////////////////////////////////
1073 /// Align down to specified alignment
1074 //////////////////////////////////////////////////////////////////////////
1075 template <typename T1, typename T2>
1076 INLINE
1077 static T1 AlignDown(T1 value, T2 alignment)
1078 {
1079     if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
1080     return value - T1(value % alignment);
1081 }
1082 
1083 //////////////////////////////////////////////////////////////////////////
1084 /// Align down to specified alignment
1085 //////////////////////////////////////////////////////////////////////////
1086 template <typename T1, typename T2>
1087 INLINE
1088 static T1* AlignDown(T1* value, T2 alignment)
1089 {
1090     return (T1*)AlignDown(uintptr_t(value), alignment);
1091 }
1092 
1093 //////////////////////////////////////////////////////////////////////////
1094 /// Align up to specified alignment
1095 /// Note: IsPow2(alignment) MUST be true
1096 //////////////////////////////////////////////////////////////////////////
1097 template <typename T1, typename T2>
1098 INLINE
1099 static T1 AlignUp(T1 value, T2 alignment)
1100 {
1101     return AlignDown(value + T1(alignment - 1), alignment);
1102 }
1103 
1104 //////////////////////////////////////////////////////////////////////////
1105 /// Align up to specified alignment
1106 /// Note: IsPow2(alignment) MUST be true
1107 //////////////////////////////////////////////////////////////////////////
1108 template <typename T1, typename T2>
1109 INLINE
1110 static T1* AlignUp(T1* value, T2 alignment)
1111 {
1112     return AlignDown(PtrAdd(value, alignment - 1), alignment);
1113 }
1114 
1115 //////////////////////////////////////////////////////////////////////////
1116 /// Helper structure used to access an array of elements that don't
1117 /// correspond to a typical word size.
1118 //////////////////////////////////////////////////////////////////////////
1119 template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
1120 class BitsArray
1121 {
1122 private:
1123     static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
1124     static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
1125     static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
1126     static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;
1127 
1128     static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
1129         "Element size must an integral fraction of pointer size");
1130 
1131     size_t              m_words[NUM_WORDS] = {};
1132 
1133 public:
1134 
1135     T operator[] (size_t elementIndex) const
1136     {
1137         size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
1138         word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
1139         return T(word & ELEMENT_MASK);
1140     }
1141 };
1142 
// Ranged integer argument for TemplateArgUnroller
// Carries a runtime value expected to lie in [TMin, TMax]; the static range
// bounds drive the recursive expansion in TemplateArgUnroller::GetFunc.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // runtime value; must satisfy TMin <= val <= TMax
};
1149 
// Recursive template used to auto-nest conditionals.  Converts dynamic boolean function
// arguments to static template arguments.
//
// TermT must expose `FuncType` and a static `GetFunc<Args...>()` returning a
// FuncType for a fully-resolved static argument list.  Each GetFunc overload
// below peels one runtime argument, branches on its value, appends the
// matching std::true_type/std::false_type (for bools) or
// std::integral_constant<uint32_t, N> (for IntArg ranges) to the accumulated
// pack ArgsB, and recurses on the remaining runtime arguments.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    // Linearly probes candidates from TMax down to TMin; each miss retries
    // with the range shrunk by one until iArg.val is matched.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        // Unreachable when iArg.val is within [TMin, TMax] as required.
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range: the static value is known, no probing needed.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        // Unreachable when iArg.val is within [TMin, TMax] as required.
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range with more args to process.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1228 
1229 
1230