1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core related to pixel formats.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "core/utils.h"
31 #include "common/simdintrin.h"
32
33 INLINE
vTranspose(simd4scalar & row0,simd4scalar & row1,simd4scalar & row2,simd4scalar & row3)34 void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
35 {
36 simd4scalari row0i = SIMD128::castps_si(row0);
37 simd4scalari row1i = SIMD128::castps_si(row1);
38 simd4scalari row2i = SIMD128::castps_si(row2);
39 simd4scalari row3i = SIMD128::castps_si(row3);
40
41 simd4scalari vTemp = row2i;
42 row2i = SIMD128::unpacklo_epi32(row2i, row3i);
43 vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
44
45 row3i = row0i;
46 row0i = SIMD128::unpacklo_epi32(row0i, row1i);
47 row3i = SIMD128::unpackhi_epi32(row3i, row1i);
48
49 row1i = row0i;
50 row0i = SIMD128::unpacklo_epi64(row0i, row2i);
51 row1i = SIMD128::unpackhi_epi64(row1i, row2i);
52
53 row2i = row3i;
54 row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
55 row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
56
57 row0 = SIMD128::castsi_ps(row0i);
58 row1 = SIMD128::castsi_ps(row1i);
59 row2 = SIMD128::castsi_ps(row2i);
60 row3 = SIMD128::castsi_ps(row3i);
61 }
62
63 INLINE
vTranspose(simd4scalari & row0,simd4scalari & row1,simd4scalari & row2,simd4scalari & row3)64 void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
65 {
66 simd4scalari vTemp = row2;
67 row2 = SIMD128::unpacklo_epi32(row2, row3);
68 vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
69
70 row3 = row0;
71 row0 = SIMD128::unpacklo_epi32(row0, row1);
72 row3 = SIMD128::unpackhi_epi32(row3, row1);
73
74 row1 = row0;
75 row0 = SIMD128::unpacklo_epi64(row0, row2);
76 row1 = SIMD128::unpackhi_epi64(row1, row2);
77
78 row2 = row3;
79 row2 = SIMD128::unpacklo_epi64(row2, vTemp);
80 row3 = SIMD128::unpackhi_epi64(row3, vTemp);
81 }
82
83 #if KNOB_SIMD_WIDTH == 8
84 INLINE
vTranspose3x8(simd4scalar (& vDst)[8],const simdscalar & vSrc0,const simdscalar & vSrc1,const simdscalar & vSrc2)85 void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
86 {
87 simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
88 simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
89 simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
90 simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
91
92 r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
93 r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77
94 simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
95 simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
96
97 vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
98 vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
99 vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
100 vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
101
102 vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
103 vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
104 vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
105 vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
106 }
107
108 INLINE
vTranspose4x8(simd4scalar (& vDst)[8],const simdscalar & vSrc0,const simdscalar & vSrc1,const simdscalar & vSrc2,const simdscalar & vSrc3)109 void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
110 {
111 simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
112 simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
113 simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
114 simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
115
116 r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
117 r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); //y2w2y3w3 y6w6yw77
118 simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
119 simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
120
121 vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
122 vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
123 vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
124 vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
125
126 vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
127 vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
128 vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
129 vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
130 }
131
132 #if ENABLE_AVX512_SIMD16
133 INLINE
vTranspose4x16(simd16scalar (& dst)[4],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2,const simd16scalar & src3)134 void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
135 {
136 const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
137
138 simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
139 simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
140 simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
141 simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
142
143 simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
144 simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
145 simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
146 simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
147
148 dst[0] = _simd16_unpacklo_ps(rblo, galo);
149 dst[1] = _simd16_unpackhi_ps(rblo, galo);
150 dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
151 dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
152 }
153
154 #endif
155 INLINE
vTranspose8x8(simdscalar (& vDst)[8],const simdscalar & vMask0,const simdscalar & vMask1,const simdscalar & vMask2,const simdscalar & vMask3,const simdscalar & vMask4,const simdscalar & vMask5,const simdscalar & vMask6,const simdscalar & vMask7)156 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
157 {
158 simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
159 simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
160 simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
161 simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
162 simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
163 simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
164 simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
165 simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
166 simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
167 simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
168 simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
169 simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
170 simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
171 simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
172 simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
173 simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
174 vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
175 vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
176 vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
177 vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
178 vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
179 vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
180 vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
181 vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
182 }
183
184 INLINE
vTranspose8x8(simdscalar (& vDst)[8],const simdscalari & vMask0,const simdscalari & vMask1,const simdscalari & vMask2,const simdscalari & vMask3,const simdscalari & vMask4,const simdscalari & vMask5,const simdscalari & vMask6,const simdscalari & vMask7)185 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
186 {
187 vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3),
188 _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
189 }
190 #endif
191
192 //////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
194 //////////////////////////////////////////////////////////////////////////
195 template<uint32_t bpp>
196 struct TransposeSingleComponent
197 {
198 //////////////////////////////////////////////////////////////////////////
199 /// @brief Pass-thru for single component.
200 /// @param pSrc - source data in SOA form
201 /// @param pDst - output data in AOS form
TransposeTransposeSingleComponent202 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
203 {
204 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
205 }
206 #if ENABLE_AVX512_SIMD16
207
Transpose_16TransposeSingleComponent208 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
209 {
210 memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
211 }
212 #endif
213 };
214
215 //////////////////////////////////////////////////////////////////////////
216 /// Transpose8_8_8_8
217 //////////////////////////////////////////////////////////////////////////
218 struct Transpose8_8_8_8
219 {
220 //////////////////////////////////////////////////////////////////////////
221 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
222 /// @param pSrc - source data in SOA form
223 /// @param pDst - output data in AOS form
TransposeTranspose8_8_8_8224 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
225 {
226 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
227
228 #if KNOB_SIMD_WIDTH == 8
229 #if KNOB_ARCH <= KNOB_ARCH_AVX
230 simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
231 simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
232 simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
233 simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
234 simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
235 simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
236 simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
237 simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
238 SIMD128::store_si((simd4scalari*)pDst, c0123lo);
239 SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
240 #else
241 simdscalari dst01 = _simd_shuffle_epi8(src,
242 _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
243 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
244 dst23 = _simd_shuffle_epi8(dst23,
245 _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
246 simdscalari dst = _simd_or_si(dst01, dst23);
247 _simd_store_si((simdscalari*)pDst, dst);
248 #endif
249 #else
250 #error Unsupported vector width
251 #endif
252 }
253 #if ENABLE_AVX512_SIMD16
254
Transpose_16Transpose8_8_8_8255 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
256 {
257 simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
258 simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
259 simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
260 simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
261
262 simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
263 simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
264 simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
265 simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
266
267 simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
268 simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
269 simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
270
271 simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
272
273 _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
274 }
275 #endif
276 };
277
278 //////////////////////////////////////////////////////////////////////////
279 /// Transpose8_8_8
280 //////////////////////////////////////////////////////////////////////////
281 struct Transpose8_8_8
282 {
283 //////////////////////////////////////////////////////////////////////////
284 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
285 /// @param pSrc - source data in SOA form
286 /// @param pDst - output data in AOS form
287 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
288 #if ENABLE_AVX512_SIMD16
289
290 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
291 #endif
292 };
293
294 //////////////////////////////////////////////////////////////////////////
295 /// Transpose8_8
296 //////////////////////////////////////////////////////////////////////////
297 struct Transpose8_8
298 {
299 //////////////////////////////////////////////////////////////////////////
300 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
301 /// @param pSrc - source data in SOA form
302 /// @param pDst - output data in AOS form
TransposeTranspose8_8303 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
304 {
305 #if KNOB_SIMD_WIDTH == 8
306 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
307
308 simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
309 simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
310 rg = SIMD128::unpacklo_epi8(rg, g);
311 SIMD128::store_si((simd4scalari*)pDst, rg);
312 #else
313 #error Unsupported vector width
314 #endif
315 }
316 #if ENABLE_AVX512_SIMD16
317
Transpose_16Transpose8_8318 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
319 {
320 simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
321 simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
322
323 simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
324 simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
325
326 simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
327
328 simdscalari dst = _simd_or_si(cvt0, shl1);
329
330 _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
331 }
332 #endif
333 };
334
335 //////////////////////////////////////////////////////////////////////////
336 /// Transpose32_32_32_32
337 //////////////////////////////////////////////////////////////////////////
338 struct Transpose32_32_32_32
339 {
340 //////////////////////////////////////////////////////////////////////////
341 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
342 /// @param pSrc - source data in SOA form
343 /// @param pDst - output data in AOS form
TransposeTranspose32_32_32_32344 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
345 {
346 #if KNOB_SIMD_WIDTH == 8
347 simdscalar src0 = _simd_load_ps((const float*)pSrc);
348 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
349 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
350 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
351
352 simd4scalar vDst[8];
353 vTranspose4x8(vDst, src0, src1, src2, src3);
354 SIMD128::store_ps((float*)pDst, vDst[0]);
355 SIMD128::store_ps((float*)pDst+4, vDst[1]);
356 SIMD128::store_ps((float*)pDst+8, vDst[2]);
357 SIMD128::store_ps((float*)pDst+12, vDst[3]);
358 SIMD128::store_ps((float*)pDst+16, vDst[4]);
359 SIMD128::store_ps((float*)pDst+20, vDst[5]);
360 SIMD128::store_ps((float*)pDst+24, vDst[6]);
361 SIMD128::store_ps((float*)pDst+28, vDst[7]);
362 #else
363 #error Unsupported vector width
364 #endif
365 }
366 #if ENABLE_AVX512_SIMD16
367
Transpose_16Transpose32_32_32_32368 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
369 {
370 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
371 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
372 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
373 simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
374
375 simd16scalar dst[4];
376
377 vTranspose4x16(dst, src0, src1, src2, src3);
378
379 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]);
380 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
381 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
382 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
383 }
384 #endif
385 };
386
387 //////////////////////////////////////////////////////////////////////////
388 /// Transpose32_32_32
389 //////////////////////////////////////////////////////////////////////////
390 struct Transpose32_32_32
391 {
392 //////////////////////////////////////////////////////////////////////////
393 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
394 /// @param pSrc - source data in SOA form
395 /// @param pDst - output data in AOS form
TransposeTranspose32_32_32396 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
397 {
398 #if KNOB_SIMD_WIDTH == 8
399 simdscalar src0 = _simd_load_ps((const float*)pSrc);
400 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
401 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
402
403 simd4scalar vDst[8];
404 vTranspose3x8(vDst, src0, src1, src2);
405 SIMD128::store_ps((float*)pDst, vDst[0]);
406 SIMD128::store_ps((float*)pDst + 4, vDst[1]);
407 SIMD128::store_ps((float*)pDst + 8, vDst[2]);
408 SIMD128::store_ps((float*)pDst + 12, vDst[3]);
409 SIMD128::store_ps((float*)pDst + 16, vDst[4]);
410 SIMD128::store_ps((float*)pDst + 20, vDst[5]);
411 SIMD128::store_ps((float*)pDst + 24, vDst[6]);
412 SIMD128::store_ps((float*)pDst + 28, vDst[7]);
413 #else
414 #error Unsupported vector width
415 #endif
416 }
417 #if ENABLE_AVX512_SIMD16
418
Transpose_16Transpose32_32_32419 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
420 {
421 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
422 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
423 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
424 simd16scalar src3 = _simd16_setzero_ps();
425
426 simd16scalar dst[4];
427
428 vTranspose4x16(dst, src0, src1, src2, src3);
429
430 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]);
431 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
432 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
433 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
434 }
435 #endif
436 };
437
438 //////////////////////////////////////////////////////////////////////////
439 /// Transpose32_32
440 //////////////////////////////////////////////////////////////////////////
441 struct Transpose32_32
442 {
443 //////////////////////////////////////////////////////////////////////////
444 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
445 /// @param pSrc - source data in SOA form
446 /// @param pDst - output data in AOS form
TransposeTranspose32_32447 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
448 {
449 #if KNOB_SIMD_WIDTH == 8
450 const float* pfSrc = (const float*)pSrc;
451 simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
452 simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
453 simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
454 simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
455
456 simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
457 simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
458 simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
459 simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
460
461 float* pfDst = (float*)pDst;
462 SIMD128::store_ps(pfDst + 0, dst0);
463 SIMD128::store_ps(pfDst + 4, dst1);
464 SIMD128::store_ps(pfDst + 8, dst2);
465 SIMD128::store_ps(pfDst + 12, dst3);
466 #else
467 #error Unsupported vector width
468 #endif
469 }
470 #if ENABLE_AVX512_SIMD16
471
Transpose_16Transpose32_32472 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
473 {
474 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); // rrrrrrrrrrrrrrrr
475 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); // gggggggggggggggg
476
477 simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
478 simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
479
480 simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
481 simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
482
483 simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
484 simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
485
486 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
487 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
488 }
489 #endif
490 };
491
492 //////////////////////////////////////////////////////////////////////////
493 /// Transpose16_16_16_16
494 //////////////////////////////////////////////////////////////////////////
495 struct Transpose16_16_16_16
496 {
497 //////////////////////////////////////////////////////////////////////////
498 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
499 /// @param pSrc - source data in SOA form
500 /// @param pDst - output data in AOS form
TransposeTranspose16_16_16_16501 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
502 {
503 #if KNOB_SIMD_WIDTH == 8
504 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
505 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
506
507 simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
508 simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
509 simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
510 simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
511
512 simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
513 simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
514 simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
515 simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
516
517 simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
518 simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
519 simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
520 simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
521
522 SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
523 SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
524 SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
525 SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
526 #else
527 #error Unsupported vector width
528 #endif
529 }
530 #if ENABLE_AVX512_SIMD16
531
Transpose_16Transpose16_16_16_16532 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
533 {
534 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr
535 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
536 simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
537 simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
538
539 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
540 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
541 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
542 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
543
544 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
545 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
546 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
547 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
548
549 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
550 simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
551 simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
552 simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
553
554 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba
555 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba
556 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba
557 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba
558 }
559 #endif
560 };
561
562 //////////////////////////////////////////////////////////////////////////
563 /// Transpose16_16_16
564 //////////////////////////////////////////////////////////////////////////
565 struct Transpose16_16_16
566 {
567 //////////////////////////////////////////////////////////////////////////
568 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
569 /// @param pSrc - source data in SOA form
570 /// @param pDst - output data in AOS form
TransposeTranspose16_16_16571 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
572 {
573 #if KNOB_SIMD_WIDTH == 8
574 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
575
576 simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
577 simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
578 simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
579 simd4scalari src_a = SIMD128::setzero_si();
580
581 simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
582 simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
583 simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
584 simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
585
586 simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
587 simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
588 simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
589 simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
590
591 SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
592 SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
593 SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
594 SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
595 #else
596 #error Unsupported vector width
597 #endif
598 }
599 #if ENABLE_AVX512_SIMD16
600
Transpose_16Transpose16_16_16601 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
602 {
603 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr
604 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
605 simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
606 simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa
607
608 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
609 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
610 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
611 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
612
613 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
614 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
615 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
616 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
617
618 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
619 simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
620 simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
621 simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
622
623 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba
624 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba
625 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba
626 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba
627 }
628 #endif
629 };
630
631 //////////////////////////////////////////////////////////////////////////
632 /// Transpose16_16
633 //////////////////////////////////////////////////////////////////////////
634 struct Transpose16_16
635 {
636 //////////////////////////////////////////////////////////////////////////
637 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
638 /// @param pSrc - source data in SOA form
639 /// @param pDst - output data in AOS form
TransposeTranspose16_16640 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
641 {
642 #if KNOB_SIMD_WIDTH == 8
643 simdscalar src = _simd_load_ps((const float*)pSrc);
644
645 simd4scalar comp0 = _simd_extractf128_ps(src, 0);
646 simd4scalar comp1 = _simd_extractf128_ps(src, 1);
647
648 simd4scalari comp0i = SIMD128::castps_si(comp0);
649 simd4scalari comp1i = SIMD128::castps_si(comp1);
650
651 simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
652 simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
653
654 SIMD128::store_si((simd4scalari*)pDst, resLo);
655 SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
656 #else
657 #error Unsupported vector width
658 #endif
659 }
660 #if ENABLE_AVX512_SIMD16
661
Transpose_16Transpose16_16662 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
663 {
664 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr
665 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
666
667 simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
668 simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
669
670 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
671 simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
672
673 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
674 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
675 }
676 #endif
677 };
678
679 //////////////////////////////////////////////////////////////////////////
680 /// Transpose24_8
681 //////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
694
695 //////////////////////////////////////////////////////////////////////////
696 /// Transpose32_8_24
697 //////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
710
711 //////////////////////////////////////////////////////////////////////////
712 /// Transpose4_4_4_4
713 //////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
726
727 //////////////////////////////////////////////////////////////////////////
728 /// Transpose5_6_5
729 //////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
742
743 //////////////////////////////////////////////////////////////////////////
744 /// Transpose9_9_9_5
745 //////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
758
759 //////////////////////////////////////////////////////////////////////////
760 /// Transpose5_5_5_1
761 //////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
774
775 //////////////////////////////////////////////////////////////////////////
776 /// Transpose1_5_5_5
777 //////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted. Declared so
    ///        this struct exposes the same pair of entry points as the other
    ///        packed-format transpose structs in this file.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
786
787 //////////////////////////////////////////////////////////////////////////
788 /// Transpose10_10_10_2
789 //////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
802
803 //////////////////////////////////////////////////////////////////////////
804 /// Transpose11_11_10
805 //////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
818
819 //////////////////////////////////////////////////////////////////////////
820 /// Transpose64
821 //////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point for the 64 layout.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
834
835 //////////////////////////////////////////////////////////////////////////
836 /// Transpose64_64
837 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point for the 64_64 layout.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
850
851 //////////////////////////////////////////////////////////////////////////
852 /// Transpose64_64_64
853 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point for the 64_64_64 layout.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
866
867 //////////////////////////////////////////////////////////////////////////
868 /// Transpose64_64_64_64
869 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point for the 64_64_64_64 layout.
    ///        Declared as deleted: no implementation exists for this layout,
    ///        so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief 16-wide counterpart of Transpose; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
882
883