1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include <type_traits>
32 #include <algorithm>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
36 #include "core/api.h"
37
38 #if defined(_WIN64) || defined(__x86_64__)
39 #define _MM_INSERT_EPI64 _mm_insert_epi64
40 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
41 #else
_MM_EXTRACT_EPI64(__m128i a,const int32_t ndx)42 INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
43 {
44 OSALIGNLINE(uint32_t) elems[4];
45 _mm_store_si128((__m128i*)elems, a);
46 if (ndx == 0)
47 {
48 uint64_t foo = elems[0];
49 foo |= (uint64_t)elems[1] << 32;
50 return foo;
51 }
52 else
53 {
54 uint64_t foo = elems[2];
55 foo |= (uint64_t)elems[3] << 32;
56 return foo;
57 }
58 }
59
_MM_INSERT_EPI64(__m128i a,int64_t b,const int32_t ndx)60 INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
61 {
62 OSALIGNLINE(int64_t) elems[2];
63 _mm_store_si128((__m128i*)elems, a);
64 if (ndx == 0)
65 {
66 elems[0] = b;
67 }
68 else
69 {
70 elems[1] = b;
71 }
72 __m128i out;
73 out = _mm_load_si128((const __m128i*)elems);
74 return out;
75 }
76 #endif
77
78 struct simdBBox
79 {
80 simdscalari ymin;
81 simdscalari ymax;
82 simdscalari xmin;
83 simdscalari xmax;
84 };
85
86 INLINE
vTranspose(__m128 & row0,__m128 & row1,__m128 & row2,__m128 & row3)87 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
88 {
89 __m128i row0i = _mm_castps_si128(row0);
90 __m128i row1i = _mm_castps_si128(row1);
91 __m128i row2i = _mm_castps_si128(row2);
92 __m128i row3i = _mm_castps_si128(row3);
93
94 __m128i vTemp = row2i;
95 row2i = _mm_unpacklo_epi32(row2i, row3i);
96 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
97
98 row3i = row0i;
99 row0i = _mm_unpacklo_epi32(row0i, row1i);
100 row3i = _mm_unpackhi_epi32(row3i, row1i);
101
102 row1i = row0i;
103 row0i = _mm_unpacklo_epi64(row0i, row2i);
104 row1i = _mm_unpackhi_epi64(row1i, row2i);
105
106 row2i = row3i;
107 row2i = _mm_unpacklo_epi64(row2i, vTemp);
108 row3i = _mm_unpackhi_epi64(row3i, vTemp);
109
110 row0 = _mm_castsi128_ps(row0i);
111 row1 = _mm_castsi128_ps(row1i);
112 row2 = _mm_castsi128_ps(row2i);
113 row3 = _mm_castsi128_ps(row3i);
114 }
115
116 INLINE
vTranspose(__m128i & row0,__m128i & row1,__m128i & row2,__m128i & row3)117 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
118 {
119 __m128i vTemp = row2;
120 row2 = _mm_unpacklo_epi32(row2, row3);
121 vTemp = _mm_unpackhi_epi32(vTemp, row3);
122
123 row3 = row0;
124 row0 = _mm_unpacklo_epi32(row0, row1);
125 row3 = _mm_unpackhi_epi32(row3, row1);
126
127 row1 = row0;
128 row0 = _mm_unpacklo_epi64(row0, row2);
129 row1 = _mm_unpackhi_epi64(row1, row2);
130
131 row2 = row3;
132 row2 = _mm_unpacklo_epi64(row2, vTemp);
133 row3 = _mm_unpackhi_epi64(row3, vTemp);
134 }
135
// GCC version folded into one comparable integer: major * 10000 +
// minor * 100 + patchlevel (so 4.9.0 becomes 40900).
#define GCC_VERSION ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100) + __GNUC_PATCHLEVEL__)

// On clang, and on GCC older than 4.9, redirect the _mm*_undefined_*
// intrinsics to their zero-initializing counterparts (presumably because
// those toolchains do not provide them - confirm before removing).
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps        _mm_setzero_ps
#define _mm_undefined_si128     _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps     _mm256_setzero_ps
#endif
#endif
147
148 #if KNOB_SIMD_WIDTH == 8
149 INLINE
vTranspose3x8(__m128 (& vDst)[8],const __m256 & vSrc0,const __m256 & vSrc1,const __m256 & vSrc2)150 void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
151 {
152 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
153 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
154 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
155 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
156
157 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
158 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
159 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
160 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
161
162 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
163 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
164 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
165 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
166
167 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
168 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
169 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
170 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
171 }
172
173 INLINE
vTranspose4x8(__m128 (& vDst)[8],const __m256 & vSrc0,const __m256 & vSrc1,const __m256 & vSrc2,const __m256 & vSrc3)174 void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
175 {
176 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
177 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
178 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
179 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
180
181 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
182 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
183 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
184 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
185
186 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
187 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
188 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
189 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
190
191 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
192 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
193 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
194 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
195 }
196
197 #if ENABLE_AVX512_SIMD16
198 INLINE
vTranspose4x16(simd16scalar (& dst)[4],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2,const simd16scalar & src3)199 void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
200 {
201 const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
202
203 simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
204 simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
205 simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
206 simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
207
208 simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
209 simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
210 simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
211 simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
212
213 dst[0] = _simd16_unpacklo_ps(rblo, galo);
214 dst[1] = _simd16_unpackhi_ps(rblo, galo);
215 dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
216 dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
217 }
218
219 #endif
220 INLINE
vTranspose8x8(__m256 (& vDst)[8],const __m256 & vMask0,const __m256 & vMask1,const __m256 & vMask2,const __m256 & vMask3,const __m256 & vMask4,const __m256 & vMask5,const __m256 & vMask6,const __m256 & vMask7)221 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
222 {
223 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
224 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
225 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
226 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
227 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
228 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
229 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
230 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
231 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
232 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
233 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
234 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
235 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
236 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
237 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
238 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
239 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
240 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
241 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
242 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
243 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
244 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
245 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
246 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
247 }
248
249 INLINE
vTranspose8x8(__m256 (& vDst)[8],const __m256i & vMask0,const __m256i & vMask1,const __m256i & vMask2,const __m256i & vMask3,const __m256i & vMask4,const __m256i & vMask5,const __m256i & vMask6,const __m256i & vMask7)250 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
251 {
252 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
253 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
254 }
255 #endif
256
257 //////////////////////////////////////////////////////////////////////////
258 /// TranposeSingleComponent
259 //////////////////////////////////////////////////////////////////////////
260 template<uint32_t bpp>
261 struct TransposeSingleComponent
262 {
263 //////////////////////////////////////////////////////////////////////////
264 /// @brief Pass-thru for single component.
265 /// @param pSrc - source data in SOA form
266 /// @param pDst - output data in AOS form
TransposeTransposeSingleComponent267 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
268 {
269 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
270 }
271 #if ENABLE_AVX512_SIMD16
272
Transpose_16TransposeSingleComponent273 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
274 {
275 memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
276 }
277 #endif
278 };
279
280 //////////////////////////////////////////////////////////////////////////
281 /// Transpose8_8_8_8
282 //////////////////////////////////////////////////////////////////////////
283 struct Transpose8_8_8_8
284 {
285 //////////////////////////////////////////////////////////////////////////
286 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
287 /// @param pSrc - source data in SOA form
288 /// @param pDst - output data in AOS form
TransposeTranspose8_8_8_8289 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
290 {
291 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
292
293 #if KNOB_SIMD_WIDTH == 8
294 #if KNOB_ARCH == KNOB_ARCH_AVX
295 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
296 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
297 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
298 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
299 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
300 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
301 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
302 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
303 _mm_store_si128((__m128i*)pDst, c0123lo);
304 _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
305 #elif KNOB_ARCH == KNOB_ARCH_AVX2
306 simdscalari dst01 = _mm256_shuffle_epi8(src,
307 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
308 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
309 dst23 = _mm256_shuffle_epi8(dst23,
310 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
311 simdscalari dst = _mm256_or_si256(dst01, dst23);
312 _simd_store_si((simdscalari*)pDst, dst);
313 #endif
314 #else
315 #error Unsupported vector width
316 #endif
317 }
318 #if ENABLE_AVX512_SIMD16
319
Transpose_16Transpose8_8_8_8320 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
321 {
322 __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
323 __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
324 __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
325 __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
326
327 simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
328 simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
329 simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
330 simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
331
332 simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
333 simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
334 simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
335
336 simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
337
338 _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
339 }
340 #endif
341 };
342
343 //////////////////////////////////////////////////////////////////////////
344 /// Transpose8_8_8
345 //////////////////////////////////////////////////////////////////////////
346 struct Transpose8_8_8
347 {
348 //////////////////////////////////////////////////////////////////////////
349 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
350 /// @param pSrc - source data in SOA form
351 /// @param pDst - output data in AOS form
352 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
353 #if ENABLE_AVX512_SIMD16
354
355 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
356 #endif
357 };
358
359 //////////////////////////////////////////////////////////////////////////
360 /// Transpose8_8
361 //////////////////////////////////////////////////////////////////////////
362 struct Transpose8_8
363 {
364 //////////////////////////////////////////////////////////////////////////
365 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
366 /// @param pSrc - source data in SOA form
367 /// @param pDst - output data in AOS form
TransposeTranspose8_8368 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
369 {
370 #if KNOB_SIMD_WIDTH == 8
371 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
372
373 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
374 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
375 rg = _mm_unpacklo_epi8(rg, g);
376 _mm_store_si128((__m128i*)pDst, rg);
377 #else
378 #error Unsupported vector width
379 #endif
380 }
381 #if ENABLE_AVX512_SIMD16
382
Transpose_16Transpose8_8383 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
384 {
385 __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
386 __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
387
388 simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
389 simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
390
391 simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
392
393 simdscalari dst = _simd_or_si(cvt0, shl1);
394
395 _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
396 }
397 #endif
398 };
399
400 //////////////////////////////////////////////////////////////////////////
401 /// Transpose32_32_32_32
402 //////////////////////////////////////////////////////////////////////////
403 struct Transpose32_32_32_32
404 {
405 //////////////////////////////////////////////////////////////////////////
406 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
407 /// @param pSrc - source data in SOA form
408 /// @param pDst - output data in AOS form
TransposeTranspose32_32_32_32409 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
410 {
411 #if KNOB_SIMD_WIDTH == 8
412 simdscalar src0 = _simd_load_ps((const float*)pSrc);
413 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
414 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
415 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
416
417 __m128 vDst[8];
418 vTranspose4x8(vDst, src0, src1, src2, src3);
419 _mm_store_ps((float*)pDst, vDst[0]);
420 _mm_store_ps((float*)pDst+4, vDst[1]);
421 _mm_store_ps((float*)pDst+8, vDst[2]);
422 _mm_store_ps((float*)pDst+12, vDst[3]);
423 _mm_store_ps((float*)pDst+16, vDst[4]);
424 _mm_store_ps((float*)pDst+20, vDst[5]);
425 _mm_store_ps((float*)pDst+24, vDst[6]);
426 _mm_store_ps((float*)pDst+28, vDst[7]);
427 #else
428 #error Unsupported vector width
429 #endif
430 }
431 #if ENABLE_AVX512_SIMD16
432
Transpose_16Transpose32_32_32_32433 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
434 {
435 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
436 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
437 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
438 simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
439
440 simd16scalar dst[4];
441
442 vTranspose4x16(dst, src0, src1, src2, src3);
443
444 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]);
445 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
446 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
447 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
448 }
449 #endif
450 };
451
452 //////////////////////////////////////////////////////////////////////////
453 /// Transpose32_32_32
454 //////////////////////////////////////////////////////////////////////////
455 struct Transpose32_32_32
456 {
457 //////////////////////////////////////////////////////////////////////////
458 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
459 /// @param pSrc - source data in SOA form
460 /// @param pDst - output data in AOS form
TransposeTranspose32_32_32461 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
462 {
463 #if KNOB_SIMD_WIDTH == 8
464 simdscalar src0 = _simd_load_ps((const float*)pSrc);
465 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
466 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
467
468 __m128 vDst[8];
469 vTranspose3x8(vDst, src0, src1, src2);
470 _mm_store_ps((float*)pDst, vDst[0]);
471 _mm_store_ps((float*)pDst + 4, vDst[1]);
472 _mm_store_ps((float*)pDst + 8, vDst[2]);
473 _mm_store_ps((float*)pDst + 12, vDst[3]);
474 _mm_store_ps((float*)pDst + 16, vDst[4]);
475 _mm_store_ps((float*)pDst + 20, vDst[5]);
476 _mm_store_ps((float*)pDst + 24, vDst[6]);
477 _mm_store_ps((float*)pDst + 28, vDst[7]);
478 #else
479 #error Unsupported vector width
480 #endif
481 }
482 #if ENABLE_AVX512_SIMD16
483
Transpose_16Transpose32_32_32484 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
485 {
486 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
487 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
488 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
489 simd16scalar src3 = _simd16_setzero_ps();
490
491 simd16scalar dst[4];
492
493 vTranspose4x16(dst, src0, src1, src2, src3);
494
495 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]);
496 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
497 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
498 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
499 }
500 #endif
501 };
502
503 //////////////////////////////////////////////////////////////////////////
504 /// Transpose32_32
505 //////////////////////////////////////////////////////////////////////////
506 struct Transpose32_32
507 {
508 //////////////////////////////////////////////////////////////////////////
509 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
510 /// @param pSrc - source data in SOA form
511 /// @param pDst - output data in AOS form
TransposeTranspose32_32512 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
513 {
514 #if KNOB_SIMD_WIDTH == 8
515 const float* pfSrc = (const float*)pSrc;
516 __m128 src_r0 = _mm_load_ps(pfSrc + 0);
517 __m128 src_r1 = _mm_load_ps(pfSrc + 4);
518 __m128 src_g0 = _mm_load_ps(pfSrc + 8);
519 __m128 src_g1 = _mm_load_ps(pfSrc + 12);
520
521 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
522 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
523 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
524 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
525
526 float* pfDst = (float*)pDst;
527 _mm_store_ps(pfDst + 0, dst0);
528 _mm_store_ps(pfDst + 4, dst1);
529 _mm_store_ps(pfDst + 8, dst2);
530 _mm_store_ps(pfDst + 12, dst3);
531 #else
532 #error Unsupported vector width
533 #endif
534 }
535 #if ENABLE_AVX512_SIMD16
536
Transpose_16Transpose32_32537 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
538 {
539 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); // rrrrrrrrrrrrrrrr
540 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); // gggggggggggggggg
541
542 simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
543 simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
544
545 simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
546 simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
547
548 simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
549 simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
550
551 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
552 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
553 }
554 #endif
555 };
556
557 //////////////////////////////////////////////////////////////////////////
558 /// Transpose16_16_16_16
559 //////////////////////////////////////////////////////////////////////////
560 struct Transpose16_16_16_16
561 {
562 //////////////////////////////////////////////////////////////////////////
563 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
564 /// @param pSrc - source data in SOA form
565 /// @param pDst - output data in AOS form
TransposeTranspose16_16_16_16566 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
567 {
568 #if KNOB_SIMD_WIDTH == 8
569 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
570 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
571
572 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
573 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
574 __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
575 __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
576
577 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
578 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
579 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
580 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
581
582 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
583 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
584 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
585 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
586
587 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
588 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
589 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
590 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
591 #else
592 #error Unsupported vector width
593 #endif
594 }
595 #if ENABLE_AVX512_SIMD16
596
Transpose_16Transpose16_16_16_16597 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
598 {
599 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr
600 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
601 simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
602 simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
603
604 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
605 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
606 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
607 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
608
609 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
610 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
611 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
612 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
613
614 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
615 simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
616 simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
617 simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
618
619 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba
620 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba
621 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba
622 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba
623 }
624 #endif
625 };
626
627 //////////////////////////////////////////////////////////////////////////
628 /// Transpose16_16_16
629 //////////////////////////////////////////////////////////////////////////
630 struct Transpose16_16_16
631 {
632 //////////////////////////////////////////////////////////////////////////
633 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
634 /// @param pSrc - source data in SOA form
635 /// @param pDst - output data in AOS form
TransposeTranspose16_16_16636 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
637 {
638 #if KNOB_SIMD_WIDTH == 8
639 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
640
641 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
642 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
643 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
644 __m128i src_a = _mm_undefined_si128();
645
646 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
647 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
648 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
649 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
650
651 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
652 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
653 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
654 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
655
656 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
657 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
658 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
659 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
660 #else
661 #error Unsupported vector width
662 #endif
663 }
664 #if ENABLE_AVX512_SIMD16
665
Transpose_16Transpose16_16_16666 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
667 {
668 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr
669 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
670 simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
671 simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa
672
673 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
674 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
675 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
676 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
677
678 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
679 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
680 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
681 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
682
683 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
684 simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
685 simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
686 simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
687
688 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba
689 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba
690 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba
691 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba
692 }
693 #endif
694 };
695
696 //////////////////////////////////////////////////////////////////////////
697 /// Transpose16_16
698 //////////////////////////////////////////////////////////////////////////
699 struct Transpose16_16
700 {
701 //////////////////////////////////////////////////////////////////////////
702 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
703 /// @param pSrc - source data in SOA form
704 /// @param pDst - output data in AOS form
TransposeTranspose16_16705 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
706 {
707 #if KNOB_SIMD_WIDTH == 8
708 simdscalar src = _simd_load_ps((const float*)pSrc);
709
710 __m128 comp0 = _mm256_castps256_ps128(src);
711 __m128 comp1 = _mm256_extractf128_ps(src, 1);
712
713 __m128i comp0i = _mm_castps_si128(comp0);
714 __m128i comp1i = _mm_castps_si128(comp1);
715
716 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
717 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
718
719 _mm_store_si128((__m128i*)pDst, resLo);
720 _mm_store_si128((__m128i*)pDst + 1, resHi);
721 #else
722 #error Unsupported vector width
723 #endif
724 }
725 #if ENABLE_AVX512_SIMD16
726
Transpose_16Transpose16_16727 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
728 {
729 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr
730 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
731
732 simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
733 simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
734
735 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
736 simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
737
738 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
739 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
740 }
741 #endif
742 };
743
744 //////////////////////////////////////////////////////////////////////////
745 /// Transpose24_8
746 //////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
759
760 //////////////////////////////////////////////////////////////////////////
761 /// Transpose32_8_24
762 //////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
775
776 //////////////////////////////////////////////////////////////////////////
777 /// Transpose4_4_4_4
778 //////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
791
792 //////////////////////////////////////////////////////////////////////////
793 /// Transpose5_6_5
794 //////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
807
808 //////////////////////////////////////////////////////////////////////////
809 /// Transpose9_9_9_5
810 //////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
823
824 //////////////////////////////////////////////////////////////////////////
825 /// Transpose5_5_5_1
826 //////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
839
840 //////////////////////////////////////////////////////////////////////////
841 /// Transpose1_5_5_5
842 //////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise
    /// deleted. Declared here so this struct exposes the same member set as
    /// the other unimplemented packed-format transposes in this file.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
851
852 //////////////////////////////////////////////////////////////////////////
853 /// Transpose10_10_10_2
854 //////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
867
868 //////////////////////////////////////////////////////////////////////////
869 /// Transpose11_11_10
870 //////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data. Declared as
    ///        deleted: there is no implementation, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
883
884 //////////////////////////////////////////////////////////////////////////
885 /// Transpose64
886 //////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion. Declared as deleted: there is no
    ///        implementation, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
899
900 //////////////////////////////////////////////////////////////////////////
901 /// Transpose64_64
902 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion. Declared as deleted: there is no
    ///        implementation, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
915
916 //////////////////////////////////////////////////////////////////////////
917 /// Transpose64_64_64
918 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion. Declared as deleted: there is no
    ///        implementation, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
931
932 //////////////////////////////////////////////////////////////////////////
933 /// Transpose64_64_64_64
934 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion. Declared as deleted: there is no
    ///        implementation, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// Variant declared only when ENABLE_AVX512_SIMD16 is non-zero; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
947
// Compile-time loop unroller.
// UnrollerL<Begin, End, Step>::step(func) calls func(Begin), func(Begin + Step), ...
// and stops once the index equals End (End itself is not visited). The recursion
// only terminates when the index lands exactly on End, so (End - Begin) must be a
// non-negative multiple of Step.
template <int Begin, int End, int Step = 1>
struct UnrollerL
{
    template <typename Lambda>
    INLINE static void step(Lambda& func)
    {
        // Handle the current index, then hand over to the next instantiation.
        func(Begin);

        typedef UnrollerL<Begin + Step, End, Step> NextIteration;
        NextIteration::step(func);
    }
};

// Recursion terminator: selected when the index has reached End; does nothing.
template <int End, int Step>
struct UnrollerL<End, End, Step>
{
    template <typename Lambda>
    static void step(Lambda&)
    {
    }
};
964
// Compile-time loop unroller with a mask to skip specific iterations.
// UnrollerLMask<Begin, End, Step, Mask>::step(func) walks the indices Begin,
// Begin + Step, ... up to (but excluding) End and calls func(i) only for indices
// whose bit (1 << i) is set in Mask. Indices must therefore be smaller than the
// bit width of int. The default Mask of 0x7f enables indices 0..6.
// The recursion only terminates when the index lands exactly on End, so
// (End - Begin) must be a non-negative multiple of Step.
template <int Begin, int End, int Step = 1, int Mask = 0x7f>
struct UnrollerLMask
{
    template <typename Lambda>
    INLINE static void step(Lambda& func)
    {
        if (Mask & (1 << Begin))
        {
            func(Begin);
        }
        // Recurse into the masked unroller (not the unmasked UnrollerL) so the
        // mask is honored for every remaining iteration, not just the first one.
        UnrollerLMask<Begin + Step, End, Step, Mask>::step(func);
    }
};

// Recursion terminator: selected when the index has reached End; does nothing.
template <int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask>
{
    template <typename Lambda>
    static void step(Lambda&)
    {
    }
};
984
985 // general CRC compute
986 INLINE
987 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
988 {
989 #if defined(_WIN64) || defined(__x86_64__)
990 uint32_t sizeInQwords = size / sizeof(uint64_t);
991 uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
992 uint64_t* pDataWords = (uint64_t*)pData;
993 for (uint32_t i = 0; i < sizeInQwords; ++i)
994 {
995 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
996 }
997 #else
998 uint32_t sizeInDwords = size / sizeof(uint32_t);
999 uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
1000 uint32_t* pDataWords = (uint32_t*)pData;
1001 for (uint32_t i = 0; i < sizeInDwords; ++i)
1002 {
1003 crc = _mm_crc32_u32(crc, *pDataWords++);
1004 }
1005 #endif
1006
1007 uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
1008 for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
1009 {
1010 crc = _mm_crc32_u8(crc, *pRemainderBytes++);
1011 }
1012
1013 return crc;
1014 }
1015
//////////////////////////////////////////////////////////////////////////
/// Offset a pointer of any type by a number of bytes.
/// The offset is applied to the raw address (not scaled by sizeof(T)) and
/// may be negative.
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static T* PtrAdd(T* p, intptr_t offset)
{
    return reinterpret_cast<T*>(reinterpret_cast<intptr_t>(p) + offset);
}
1026
//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
/// True when value has at most one bit set. Note that this makes 0 report
/// as a power of 2.
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static bool IsPow2(T value)
{
    // (value & -value) isolates the lowest set bit; a power of 2 is equal to it.
    const auto lowestSetBit = value & (0 - value);
    return value == lowestSetBit;
}
1036
//////////////////////////////////////////////////////////////////////////
/// Round value down to a multiple of alignment by clearing its low bits.
/// Note: IsPow2(alignment) MUST be true (checked with SWR_ASSERT)
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));

    // For a power-of-2 alignment, (alignment - 1) is exactly the set of bits to drop.
    const T1 lowBits = T1(alignment - 1);
    return value & ~lowBits;
}
1048
1049 //////////////////////////////////////////////////////////////////////////
1050 /// Align up to specified alignment
1051 /// Note: IsPow2(alignment) MUST be true
1052 //////////////////////////////////////////////////////////////////////////
1053 template <typename T1, typename T2>
1054 INLINE
1055 static T1 AlignUpPow2(T1 value, T2 alignment)
1056 {
1057 return AlignDownPow2(value + T1(alignment - 1), alignment);
1058 }
1059
1060 //////////////////////////////////////////////////////////////////////////
1061 /// Align up ptr to specified alignment
1062 /// Note: IsPow2(alignment) MUST be true
1063 //////////////////////////////////////////////////////////////////////////
1064 template <typename T1, typename T2>
1065 INLINE
1066 static T1* AlignUpPow2(T1* value, T2 alignment)
1067 {
1068 return reinterpret_cast<T1*>(
1069 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
1070 }
1071
1072 //////////////////////////////////////////////////////////////////////////
1073 /// Align down to specified alignment
1074 //////////////////////////////////////////////////////////////////////////
1075 template <typename T1, typename T2>
1076 INLINE
1077 static T1 AlignDown(T1 value, T2 alignment)
1078 {
1079 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
1080 return value - T1(value % alignment);
1081 }
1082
1083 //////////////////////////////////////////////////////////////////////////
1084 /// Align down to specified alignment
1085 //////////////////////////////////////////////////////////////////////////
1086 template <typename T1, typename T2>
1087 INLINE
1088 static T1* AlignDown(T1* value, T2 alignment)
1089 {
1090 return (T1*)AlignDown(uintptr_t(value), alignment);
1091 }
1092
1093 //////////////////////////////////////////////////////////////////////////
1094 /// Align up to specified alignment
1095 /// Note: IsPow2(alignment) MUST be true
1096 //////////////////////////////////////////////////////////////////////////
1097 template <typename T1, typename T2>
1098 INLINE
1099 static T1 AlignUp(T1 value, T2 alignment)
1100 {
1101 return AlignDown(value + T1(alignment - 1), alignment);
1102 }
1103
1104 //////////////////////////////////////////////////////////////////////////
1105 /// Align up to specified alignment
1106 /// Note: IsPow2(alignment) MUST be true
1107 //////////////////////////////////////////////////////////////////////////
1108 template <typename T1, typename T2>
1109 INLINE
1110 static T1* AlignUp(T1* value, T2 alignment)
1111 {
1112 return AlignDown(PtrAdd(value, alignment - 1), alignment);
1113 }
1114
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
///
/// ArrayLenT elements of BitsPerElementT bits each are packed into an
/// array of size_t words, lowest element in the lowest bits of each word.
/// BitsPerElementT must divide the word size evenly; it may be as large as
/// the full word. Storage is zero-initialized.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Low BitsPerElementT bits set. Built by shifting an all-ones word right
    // instead of (1 << BitsPerElementT) - 1, which would shift by the full
    // word width (not a valid constant expression) for word-sized elements.
    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t              m_words[NUM_WORDS] = {};

public:

    /// @brief Reads one packed element.
    /// @param elementIndex - index of the element; not range-checked, the
    ///                       caller must keep it below ArrayLenT
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        // Move the requested element down to bit 0, then strip its neighbors.
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
1142
// Ranged integer argument for TemplateArgUnroller.
// The range bounds TMin and TMax live in the type; the run-time value is carried
// in val. TemplateArgUnroller::GetFunc compares val against TMax and then against
// each lower bound in turn to turn it into a compile-time constant.
// Kept as a plain aggregate: TemplateArgUnroller builds narrowed ranges with
// brace initialization (IntArg<TMin, TMax - 1>{val}).
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // run-time value to be matched against the range encoded in the type
};
1149
// Recursive template used to auto-nest conditionals.  Converts dynamic boolean and
// ranged-integer (IntArg) function arguments to static template arguments.
//
// Each GetFunc overload consumes the first run-time argument, appends the matching
// type (std::true_type / std::false_type, or std::integral_constant<uint32_t, N>)
// to the ArgsB pack, and recurses on the remaining arguments. When no arguments
// remain, the accumulated pack is handed to TermT::GetFunc<...>(), whose result is
// returned as TermT::FuncType.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator: bArg is the final argument, so resolve through TermT directly.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args: record bArg as a type, then continue with the rest.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator: test the value against TMax; on a miss, retry with the
    // range narrowed to [TMin, TMax - 1]. The narrowing ends at the more specialized
    // IntArg<TVal, TVal> overload below.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        // Only reached when TMax <= TMin and the value did not match TMax.
        // NOTE(review): SWR_ASSUME is defined outside this chunk; presumably it marks
        // this path as unreachable - confirm.
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range: nothing left to test, the value must be TVal (asserted).
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args: same matching as above, but once the value is resolved
    // continue with the remaining arguments instead of terminating.
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        // Only reached when TMax <= TMin and the value did not match TMax.
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range with further arguments: record TVal and continue.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1228
1229
1230