• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file formats.h
24 *
25 * @brief Definitions for SWR_FORMAT functions.
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include "utils.h"
31 
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
34 //////////////////////////////////////////////////////////////////////////
35 template <uint32_t NumBits, bool Signed = false>
36 struct PackTraits
37 {
38     static const uint32_t MyNumBits = NumBits;
39     static simdscalar loadSOA(const uint8_t *pSrc) = delete;
40     static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
41     static simdscalar unpack(simdscalar &in) = delete;
42     static simdscalar pack(simdscalar &in) = delete;
43 #if ENABLE_AVX512_SIMD16
44     static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
45     static void storeSOA(uint8_t *pDst, simd16scalar src) = delete;
46     static simd16scalar unpack(simd16scalar &in) = delete;
47     static simd16scalar pack(simd16scalar &in) = delete;
48 #endif
49 };
50 
51 //////////////////////////////////////////////////////////////////////////
52 /// PackTraits - Helpers for packing / unpacking unused channels
53 //////////////////////////////////////////////////////////////////////////
54 template <>
55 struct PackTraits<0, false>
56 {
57     static const uint32_t MyNumBits = 0;
58 
59     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
60     static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
61     static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
62     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
63 #if ENABLE_AVX512_SIMD16
64     static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
65     static void storeSOA(uint8_t *pDst, simd16scalar src) { return; }
66     static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
67     static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
68 #endif
69 };
70 
71 //////////////////////////////////////////////////////////////////////////
72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
73 //////////////////////////////////////////////////////////////////////////
74 template <>
75 struct PackTraits<8, false>
76 {
77     static const uint32_t MyNumBits = 8;
78 
79     static simdscalar loadSOA(const uint8_t *pSrc)
80     {
81 #if KNOB_SIMD_WIDTH == 8
82         __m256 result = _mm256_setzero_ps();
83         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
84         return _mm256_insertf128_ps(result, vLo, 0);
85 #else
86 #error Unsupported vector width
87 #endif
88     }
89 
90     static void storeSOA(uint8_t *pDst, simdscalar src)
91     {
92         // store simd bytes
93 #if KNOB_SIMD_WIDTH == 8
94         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
95 #else
96 #error Unsupported vector width
97 #endif
98     }
99 
100     static simdscalar unpack(simdscalar &in)
101     {
102 #if KNOB_SIMD_WIDTH == 8
103 #if KNOB_ARCH==KNOB_ARCH_AVX
104         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
105         __m128i resLo = _mm_cvtepu8_epi32(src);
106         __m128i resHi = _mm_shuffle_epi8(src,
107             _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
108 
109         __m256i result = _mm256_castsi128_si256(resLo);
110         result = _mm256_insertf128_si256(result, resHi, 1);
111         return _mm256_castsi256_ps(result);
112 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
113         return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
114 #endif
115 #else
116 #error Unsupported vector width
117 #endif
118     }
119 
120     static simdscalar pack(simdscalar &in)
121     {
122 #if KNOB_SIMD_WIDTH == 8
123         simdscalari src = _simd_castps_si(in);
124         __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
125         __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
126         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
127 #else
128 #error Unsupported vector width
129 #endif
130     }
131 #if ENABLE_AVX512_SIMD16
132 
133     static simd16scalar loadSOA_16(const uint8_t *pSrc)
134     {
135         simd16scalar result = _simd16_setzero_ps();
136         simdscalar resultlo = _simd_setzero_ps();
137 
138         const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
139 
140         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
141         result = _simd16_insert_ps(result, resultlo, 0);
142 
143         return result;
144     }
145 
146     static void storeSOA(uint8_t *pDst, simd16scalar src)
147     {
148         // store simd16 bytes
149         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
150     }
151 
152     static simd16scalar unpack(simd16scalar &in)
153     {
154         simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
155 
156         return _simd16_castsi_ps(result);
157     }
158 
159     static simd16scalar pack(simd16scalar &in)
160     {
161         simd16scalari result = _simd16_setzero_si();
162 
163         simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));          // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
164         simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));          // r8 r9 rA rB rC rD rE rF
165 
166         simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);           // r0 r1 r2 r3 r8 r9 rA rB (32b)
167         simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);           // r4 r5 r6 r7 rC rD rE rF (32b)
168 
169         simdscalari pack = _simd_packus_epi32(permlo, permhi);                  // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
170 
171         const simdscalari zero = _simd_setzero_si();
172 
173         permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
174         permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
175 
176         pack = _simd_packus_epi16(permlo, permhi);                              // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
177 
178         result = _simd16_insert_si(result, pack, 0);
179 
180         return _simd16_castsi_ps(result);
181     }
182 #endif
183 };
184 
185 //////////////////////////////////////////////////////////////////////////
186 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
187 //////////////////////////////////////////////////////////////////////////
188 template <>
189 struct PackTraits<8, true>
190 {
191     static const uint32_t MyNumBits = 8;
192 
193     static simdscalar loadSOA(const uint8_t *pSrc)
194     {
195 #if KNOB_SIMD_WIDTH == 8
196         __m256 result = _mm256_setzero_ps();
197         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
198         return _mm256_insertf128_ps(result, vLo, 0);
199 #else
200 #error Unsupported vector width
201 #endif
202     }
203 
204     static void storeSOA(uint8_t *pDst, simdscalar src)
205     {
206         // store simd bytes
207 #if KNOB_SIMD_WIDTH == 8
208         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
209 #else
210 #error Unsupported vector width
211 #endif
212     }
213 
214     static simdscalar unpack(simdscalar &in)
215     {
216 #if KNOB_SIMD_WIDTH == 8
217 #if KNOB_ARCH==KNOB_ARCH_AVX
218         SWR_ASSERT(0); // I think this may be incorrect.
219         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
220         __m128i resLo = _mm_cvtepi8_epi32(src);
221         __m128i resHi = _mm_shuffle_epi8(src,
222             _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
223 
224         __m256i result = _mm256_castsi128_si256(resLo);
225         result = _mm256_insertf128_si256(result, resHi, 1);
226         return _mm256_castsi256_ps(result);
227 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
228         return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
229 #endif
230 #else
231 #error Unsupported vector width
232 #endif
233     }
234 
235     static simdscalar pack(simdscalar &in)
236     {
237 #if KNOB_SIMD_WIDTH == 8
238         simdscalari src = _simd_castps_si(in);
239         __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
240         __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
241         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
242 #else
243 #error Unsupported vector width
244 #endif
245     }
246 #if ENABLE_AVX512_SIMD16
247 
248     static simd16scalar loadSOA_16(const uint8_t *pSrc)
249     {
250         simd16scalar result = _simd16_setzero_ps();
251         simdscalar resultlo = _simd_setzero_ps();
252 
253         const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
254 
255         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
256         result = _simd16_insert_ps(result, resultlo, 0);
257 
258         return result;
259     }
260 
261     static void storeSOA(uint8_t *pDst, simd16scalar src)
262     {
263         // store simd16 bytes
264         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
265     }
266 
267     static simd16scalar unpack(simd16scalar &in)
268     {
269         simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
270 
271         return _simd16_castsi_ps(result);
272     }
273 
274     static simd16scalar pack(simd16scalar &in)
275     {
276         simd16scalari result = _simd16_setzero_si();
277 
278         simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));          // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
279         simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));          // r8 r9 rA rB rC rD rE rF
280 
281         simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);           // r0 r1 r2 r3 r8 r9 rA rB (32b)
282         simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);           // r4 r5 r6 r7 rC rD rE rF (32b)
283 
284         simdscalari pack = _simd_packs_epi32(permlo, permhi);                   // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
285 
286         const simdscalari zero = _simd_setzero_si();
287 
288         permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
289         permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
290 
291         pack = _simd_packs_epi16(permlo, permhi);                               // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
292 
293         result = _simd16_insert_si(result, pack, 0);
294 
295         return _simd16_castsi_ps(result);
296     }
297 #endif
298 };
299 
300 //////////////////////////////////////////////////////////////////////////
301 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
302 //////////////////////////////////////////////////////////////////////////
303 template <>
304 struct PackTraits<16, false>
305 {
306     static const uint32_t MyNumBits = 16;
307 
308     static simdscalar loadSOA(const uint8_t *pSrc)
309     {
310 #if KNOB_SIMD_WIDTH == 8
311         __m256 result = _mm256_setzero_ps();
312         __m128 vLo = _mm_load_ps((const float*)pSrc);
313         return _mm256_insertf128_ps(result, vLo, 0);
314 #else
315 #error Unsupported vector width
316 #endif
317     }
318 
319     static void storeSOA(uint8_t *pDst, simdscalar src)
320     {
321 #if KNOB_SIMD_WIDTH == 8
322         // store 16B (2B * 8)
323         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
324 #else
325 #error Unsupported vector width
326 #endif
327     }
328 
329     static simdscalar unpack(simdscalar &in)
330     {
331 #if KNOB_SIMD_WIDTH == 8
332 #if KNOB_ARCH==KNOB_ARCH_AVX
333         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
334         __m128i resLo = _mm_cvtepu16_epi32(src);
335         __m128i resHi = _mm_shuffle_epi8(src,
336             _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
337 
338         __m256i result = _mm256_castsi128_si256(resLo);
339         result = _mm256_insertf128_si256(result, resHi, 1);
340         return _mm256_castsi256_ps(result);
341 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
342         return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
343 #endif
344 #else
345 #error Unsupported vector width
346 #endif
347     }
348 
349     static simdscalar pack(simdscalar &in)
350     {
351 #if KNOB_SIMD_WIDTH == 8
352         simdscalari src = _simd_castps_si(in);
353         __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
354         return _mm256_castsi256_ps(res);
355 #else
356 #error Unsupported vector width
357 #endif
358     }
359 #if ENABLE_AVX512_SIMD16
360 
361     static simd16scalar loadSOA_16(const uint8_t *pSrc)
362     {
363         simd16scalar result = _simd16_setzero_ps();
364 
365         simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
366 
367         result = _simd16_insert_ps(result, resultlo, 0);
368 
369         return result;
370     }
371 
372     static void storeSOA(uint8_t *pDst, simd16scalar src)
373     {
374         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
375     }
376 
377     static simd16scalar unpack(simd16scalar &in)
378     {
379         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
380 
381         return _simd16_castsi_ps(result);
382     }
383 
384     static simd16scalar pack(simd16scalar &in)
385     {
386         const simd16scalari zero = _simd16_setzero_si();
387 
388         simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
389         simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
390 
391         simd16scalari result = _simd16_packus_epi32(permlo, permhi);    // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
392 
393         return _simd16_castsi_ps(result);
394     }
395 #endif
396 };
397 
398 //////////////////////////////////////////////////////////////////////////
399 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
400 //////////////////////////////////////////////////////////////////////////
401 template <>
402 struct PackTraits<16, true>
403 {
404     static const uint32_t MyNumBits = 16;
405 
406     static simdscalar loadSOA(const uint8_t *pSrc)
407     {
408 #if KNOB_SIMD_WIDTH == 8
409         __m256 result = _mm256_setzero_ps();
410         __m128 vLo = _mm_load_ps((const float*)pSrc);
411         return _mm256_insertf128_ps(result, vLo, 0);
412 #else
413 #error Unsupported vector width
414 #endif
415     }
416 
417     static void storeSOA(uint8_t *pDst, simdscalar src)
418     {
419 #if KNOB_SIMD_WIDTH == 8
420         // store 16B (2B * 8)
421         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
422 #else
423 #error Unsupported vector width
424 #endif
425     }
426 
427     static simdscalar unpack(simdscalar &in)
428     {
429 #if KNOB_SIMD_WIDTH == 8
430 #if KNOB_ARCH==KNOB_ARCH_AVX
431         SWR_ASSERT(0); // I think this is incorrectly implemented
432         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
433         __m128i resLo = _mm_cvtepi16_epi32(src);
434         __m128i resHi = _mm_shuffle_epi8(src,
435             _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
436 
437         __m256i result = _mm256_castsi128_si256(resLo);
438         result = _mm256_insertf128_si256(result, resHi, 1);
439         return _mm256_castsi256_ps(result);
440 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
441         return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
442 #endif
443 #else
444 #error Unsupported vector width
445 #endif
446     }
447 
448     static simdscalar pack(simdscalar &in)
449     {
450 #if KNOB_SIMD_WIDTH == 8
451         simdscalari src = _simd_castps_si(in);
452         __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
453         return _mm256_castsi256_ps(res);
454 #else
455 #error Unsupported vector width
456 #endif
457     }
458 #if ENABLE_AVX512_SIMD16
459 
460     static simd16scalar loadSOA_16(const uint8_t *pSrc)
461     {
462         simd16scalar result = _simd16_setzero_ps();
463 
464         simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
465 
466         result = _simd16_insert_ps(result, resultlo, 0);
467 
468         return result;
469     }
470 
471     static void storeSOA(uint8_t *pDst, simd16scalar src)
472     {
473         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
474     }
475 
476     static simd16scalar unpack(simd16scalar &in)
477     {
478         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
479 
480         return _simd16_castsi_ps(result);
481     }
482 
483     static simd16scalar pack(simd16scalar &in)
484     {
485         const simd16scalari zero = _simd16_setzero_si();
486 
487         simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
488         simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
489 
490         simd16scalari result = _simd16_packs_epi32(permlo, permhi);     // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
491 
492         return _simd16_castsi_ps(result);
493     }
494 #endif
495 };
496 
497 //////////////////////////////////////////////////////////////////////////
498 /// PackTraits - Helpers for packing / unpacking 32 bit channels
499 //////////////////////////////////////////////////////////////////////////
500 template <>
501 struct PackTraits<32, false>
502 {
503     static const uint32_t MyNumBits = 32;
504 
505     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
506     static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
507     static simdscalar unpack(simdscalar &in) { return in; }
508     static simdscalar pack(simdscalar &in) { return in; }
509 #if ENABLE_AVX512_SIMD16
510 
511     static simd16scalar loadSOA_16(const uint8_t *pSrc)
512     {
513         return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
514     }
515 
516     static void storeSOA(uint8_t *pDst, simd16scalar src)
517     {
518         _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
519     }
520 
521     static simd16scalar unpack(simd16scalar &in)
522     {
523         return in;
524     }
525 
526     static simd16scalar pack(simd16scalar &in)
527     {
528         return in;
529     }
530 #endif
531 };
532 
533 //////////////////////////////////////////////////////////////////////////
534 /// TypeTraits - Format type traits.
535 //////////////////////////////////////////////////////////////////////////
536 template<SWR_TYPE type, uint32_t NumBits>
537 struct TypeTraits : PackTraits<NumBits>
538 {
539     static const SWR_TYPE MyType = type;
540     static float toFloat() { return 0.0; }
541     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
542     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
543 };
544 
545 //////////////////////////////////////////////////////////////////////////
546 /// TypeTraits - Format type traits specialization for UINT8
547 //////////////////////////////////////////////////////////////////////////
548 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
549 {
550     static const SWR_TYPE MyType = SWR_TYPE_UINT;
551     static float toFloat() { return 0.0; }
552     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
553     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
554 };
555 
556 //////////////////////////////////////////////////////////////////////////
557 /// TypeTraits - Format type traits specialization for SINT8
558 //////////////////////////////////////////////////////////////////////////
559 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
560 {
561     static const SWR_TYPE MyType = SWR_TYPE_SINT;
562     static float toFloat() { return 0.0; }
563     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
564     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
565 };
566 
567 //////////////////////////////////////////////////////////////////////////
568 /// TypeTraits - Format type traits specialization for UINT16
569 //////////////////////////////////////////////////////////////////////////
570 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
571 {
572     static const SWR_TYPE MyType = SWR_TYPE_UINT;
573     static float toFloat() { return 0.0; }
574     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
575     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
576 };
577 
578 //////////////////////////////////////////////////////////////////////////
579 /// TypeTraits - Format type traits specialization for SINT16
580 //////////////////////////////////////////////////////////////////////////
581 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
582 {
583     static const SWR_TYPE MyType = SWR_TYPE_SINT;
584     static float toFloat() { return 0.0; }
585     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
586     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
587 };
588 
589 //////////////////////////////////////////////////////////////////////////
590 /// TypeTraits - Format type traits specialization for UINT32
591 //////////////////////////////////////////////////////////////////////////
592 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
593 {
594     static const SWR_TYPE MyType = SWR_TYPE_UINT;
595     static float toFloat() { return 0.0; }
596     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
597     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
598 };
599 
600 //////////////////////////////////////////////////////////////////////////
601 /// TypeTraits - Format type traits specialization for SINT32
602 //////////////////////////////////////////////////////////////////////////
603 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
604 {
605     static const SWR_TYPE MyType = SWR_TYPE_SINT;
606     static float toFloat() { return 0.0; }
607     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
608     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
609 };
610 
611 //////////////////////////////////////////////////////////////////////////
612 /// TypeTraits - Format type traits specialization for UNORM5
613 //////////////////////////////////////////////////////////////////////////
614 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
615 {
616     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
617     static float toFloat() { return 1.0f / 31.0f; }
618     static float fromFloat() { return 31.0f; }
619     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
620 };
621 
622 //////////////////////////////////////////////////////////////////////////
623 /// TypeTraits - Format type traits specialization for UNORM6
624 //////////////////////////////////////////////////////////////////////////
625 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
626 {
627     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
628     static float toFloat() { return 1.0f / 63.0f; }
629     static float fromFloat() { return 63.0f; }
630     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
631 };
632 
633 //////////////////////////////////////////////////////////////////////////
634 /// TypeTraits - Format type traits specialization for UNORM8
635 //////////////////////////////////////////////////////////////////////////
636 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
637 {
638     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
639     static float toFloat() { return 1.0f / 255.0f; }
640     static float fromFloat() { return 255.0f; }
641     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
642 };
643 
644 //////////////////////////////////////////////////////////////////////////
645 /// TypeTraits - Format type traits specialization for SNORM8
646 //////////////////////////////////////////////////////////////////////////
647 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
648 {
649     static const SWR_TYPE MyType = SWR_TYPE_SNORM;
650     static float toFloat() { return 1.0f / 127.0f; }
651     static float fromFloat() { return 127.0f; }
652     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
653 };
654 
655 //////////////////////////////////////////////////////////////////////////
656 /// TypeTraits - Format type traits specialization for UNORM16
657 //////////////////////////////////////////////////////////////////////////
658 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
659 {
660     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
661     static float toFloat() { return 1.0f / 65535.0f; }
662     static float fromFloat() { return 65535.0f; }
663     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
664 };
665 
666 //////////////////////////////////////////////////////////////////////////
667 /// TypeTraits - Format type traits specialization for SNORM16
668 //////////////////////////////////////////////////////////////////////////
669 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
670 {
671     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
672     static float toFloat() { return 1.0f / 32767.0f; }
673     static float fromFloat() { return 32767.0f; }
674     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
675 };
676 
677 //////////////////////////////////////////////////////////////////////////
678 /// TypeTraits - Format type traits specialization for UNORM24
679 //////////////////////////////////////////////////////////////////////////
680 template<>
681 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
682 {
683     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
684     static float toFloat() { return 1.0f / 16777215.0f; }
685     static float fromFloat() { return 16777215.0f; }
686     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
687 };
688 
689 //////////////////////////////////////////////////////////////////////////
690 // FLOAT Specializations from here on...
691 //////////////////////////////////////////////////////////////////////////
// Bit-preserving reinterpretation between float and integer 128-bit vectors.
#define TO_M128i(v) _mm_castps_si128(v)
#define TO_M128(v) _mm_castsi128_ps(v)
694 
695 #include "math.h"
696 
/// Fast approximate power, four lanes at a time.
///
/// Uses the IEEE-754 bit pattern of a float, read as an integer, as a scaled
/// and biased approximation of its base-2 logarithm: the "logarithm" is
/// multiplied by expnum/expden and the integer result is reinterpreted as a
/// float again. A constant pre-scale built from the template arguments
/// compensates for the exponent bias and applies the coeffnum/coeffden
/// weighting.
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static __m128 fastpow(__m128 arg) {
    // Constant pre-correction factor, computed once per instantiation.
    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));

    const __m128 corrected = _mm_mul_ps(arg, factor);

    // Read the float bits as integers and convert that integer to float:
    // this is the approximate logarithm.
    const __m128 approxLog = _mm_cvtepi32_ps(_mm_castps_si128(corrected));

    // Multiply the logarithm by the requested power.
    const __m128 scaledLog = _mm_mul_ps(approxLog, _mm_set1_ps(1.0f * expnum / expden));

    // Convert back to integer and reinterpret those bits as float to exponentiate.
    return _mm_castsi128_ps(_mm_cvtps_epi32(scaledLog));
}
720 
721 inline static __m128 pow512_4(__m128 arg) {
722     // 5/12 is too small, so compute the 4th root of 20/12 instead.
723     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
724     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
725     __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
726     __m128 xover = _mm_mul_ps(arg, xf);
727 
728     __m128 xfm1 = _mm_rsqrt_ps(xf);
729     __m128 x2 = _mm_mul_ps(arg, arg);
730     __m128 xunder = _mm_mul_ps(x2, xfm1);
731 
732     // sqrt2 * over + 2 * sqrt2 * under
733     __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
734         _mm_add_ps(xover, xunder));
735 
736     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
737     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
738     return xavg;
739 }
740 
/// Applies powf(lane, Exp) to each of the four float lanes of Base.
///
/// @param Base  Four packed floats used as the bases.
/// @param Exp   Exponent applied to every lane.
/// @return      Vector whose lane i holds powf(Base[i], Exp).
inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    // Spill the lanes with an explicit store instead of reading the vector
    // through a casted float pointer.
    float lanes[4];
    _mm_storeu_ps(lanes, Base);

    // _mm_set_ps takes its arguments from the highest lane to the lowest.
    return _mm_set_ps(powf(lanes[3], Exp),
                      powf(lanes[2], Exp),
                      powf(lanes[1], Exp),
                      powf(lanes[0], Exp));
}
750 
751 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
752 {
753     // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
754     __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
755 
756     // squeeze the mask down to 16 bits (4 bits per DWORD)
757     int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
758 
759     __m128 Result;
760 
761     //
762     if (CompareResult == 0xFFFF)
763     {
764         // all DWORDs are <= the threshold
765         Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
766     }
767     else if (CompareResult == 0x0)
768     {
769         // all DWORDs are > the threshold
770         __m128 fSrc_0RGB = Src;
771 
772         // --> 1.055f * c(1.0f/2.4f) - 0.055f
773 #if KNOB_USE_FAST_SRGB == TRUE
774         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
775         __m128 f = pow512_4(fSrc_0RGB);
776 #else
777         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
778 #endif
779         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
780         Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
781     }
782     else
783     {
784         // some DWORDs are <= the threshold and some are > threshold
785         __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
786 
787         __m128 fSrc_0RGB = Src;
788 
789         // --> 1.055f * c(1.0f/2.4f) - 0.055f
790 #if KNOB_USE_FAST_SRGB == TRUE
791         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
792         __m128 f = pow512_4(fSrc_0RGB);
793 #else
794         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
795 #endif
796         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
797         f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
798 
799         // Clear the alpha (is garbage after the sub)
800         __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
801 
802         __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
803         __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
804         __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
805 
806         Result = TO_M128(CombinedParts);
807     }
808 
809     return Result;
810 }
811 
812 #if ENABLE_AVX512_SIMD16
813 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
814 inline static simd16scalar fastpow(simd16scalar value)
815 {
816     static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
817         * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
818 
819     // Apply a constant pre-correction factor.
820     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
821 
822     // Reinterpret arg as integer to obtain logarithm.
823     //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
824     result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
825 
826     // Multiply logarithm by power.
827     result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
828 
829     // Convert back to "integer" to exponentiate.
830     //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
831     result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
832 
833     return result;
834 }
835 
836 inline static simd16scalar pow512_4(simd16scalar arg)
837 {
838     // 5/12 is too small, so compute the 4th root of 20/12 instead.
839     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
840     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
841     simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
842     simd16scalar xover = _simd16_mul_ps(arg, xf);
843 
844     simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
845     simd16scalar x2 = _simd16_mul_ps(arg, arg);
846     simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
847 
848     // sqrt2 * over + 2 * sqrt2 * under
849     simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));
850 
851     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
852     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
853 
854     return xavg;
855 }
856 
857 inline static simd16scalar powf_wrapper(const simd16scalar base, float exp)
858 {
859     const float *f = reinterpret_cast<const float *>(&base);
860 
861     return _simd16_set_ps(
862         powf(f[15], exp),
863         powf(f[14], exp),
864         powf(f[13], exp),
865         powf(f[12], exp),
866         powf(f[11], exp),
867         powf(f[10], exp),
868         powf(f[ 9], exp),
869         powf(f[ 8], exp),
870         powf(f[ 7], exp),
871         powf(f[ 6], exp),
872         powf(f[ 5], exp),
873         powf(f[ 4], exp),
874         powf(f[ 3], exp),
875         powf(f[ 2], exp),
876         powf(f[ 1], exp),
877         powf(f[ 0], exp)
878     );
879 }
880 
881 // float to SRGB conversion formula
882 //
883 // if (value < 0.0031308f)
884 //     value *= 12.92f;
885 // else
886 //     value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
887 //
888 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
889 {
890     // create a mask where the source is < the minimal SRGB float value
891     const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
892 
893     // if all elements are < the threshold, result = value * 12.92
894     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
895 
896     if (_simd16_mask2int(mask) != 0xFFFF)
897     {
898         // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
899 #if KNOB_USE_FAST_SRGB == TRUE
900         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
901         simd16scalar result2 = pow512_4(value);
902 #else
903         simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
904 #endif
905 
906         result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
907         result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
908 
909 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
910         // only native AVX512 can directly use the computed mask for the blend operation
911         result = _mm512_mask_blend_ps(mask, result2, result);
912 #else
913         result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
914 #endif
915     }
916 
917     return result;
918 }
919 
920 #endif
921 //////////////////////////////////////////////////////////////////////////
922 /// TypeTraits - Format type traits specialization for FLOAT16
923 //////////////////////////////////////////////////////////////////////////
924 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
925 {
926     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
927     static float toFloat() { return 1.0f; }
928     static float fromFloat() { return 1.0f; }
929     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
930 
931     static simdscalar pack(const simdscalar &in)
932     {
933 #if KNOB_SIMD_WIDTH == 8
934 #if (KNOB_ARCH == KNOB_ARCH_AVX)
935         // input is 8 packed float32, output is 8 packed float16
936         simdscalari src = _simd_castps_si(in);
937 
938         static const uint32_t FLOAT_EXP_BITS = 8;
939         static const uint32_t FLOAT_MANTISSA_BITS = 23;
940         static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
941         static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
942 
943         static const uint32_t HALF_EXP_BITS = 5;
944         static const uint32_t HALF_MANTISSA_BITS = 10;
945         static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
946 
947         // minimum exponent required, exponents below this are flushed to 0.
948         static const int32_t HALF_EXP_MIN = -14;
949         static const int32_t FLOAT_EXP_BIAS = 127;
950         static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
951         static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
952 
953         // maximum exponent required, exponents above this are set to infinity
954         static const int32_t HALF_EXP_MAX = 15;
955         static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
956 
957         const simdscalari vSignMask     = _simd_set1_epi32(0x80000000);
958         const simdscalari vExpMask      = _simd_set1_epi32(FLOAT_EXP_MASK);
959         const simdscalari vManMask      = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
960         const simdscalari vExpMin       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
961         const simdscalari vExpMinFtz    = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
962         const simdscalari vExpMax       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
963 
964         simdscalari vSign       = _simd_and_si(src, vSignMask);
965         simdscalari vExp        = _simd_and_si(src, vExpMask);
966         simdscalari vMan        = _simd_and_si(src, vManMask);
967 
968         simdscalari vFTZMask    = _simd_cmplt_epi32(vExp, vExpMinFtz);
969         simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
970         simdscalari vInfMask    = _simd_cmpeq_epi32(vExpMask, vExp);
971         simdscalari vClampMask  = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
972 
973         simdscalari vHalfExp    = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
974 
975         // pack output 16-bits into the lower 16-bits of each 32-bit channel
976         simdscalari vDst        = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
977         vDst   = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
978 
979         // Flush To Zero
980         vDst   = _simd_andnot_si(vFTZMask, vDst);
981         // Apply Infinites / NaN
982         vDst   = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
983 
984         // Apply clamps
985         vDst = _simd_andnot_si(vClampMask, vDst);
986         vDst = _simd_or_si(vDst,
987                 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
988 
989         // Compute Denormals (subnormals)
990         if (!_mm256_testz_si256(vDenormMask, vDenormMask))
991         {
992             uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
993             uint32_t *pExp = (uint32_t*)&vExp;
994             uint32_t *pMan = (uint32_t*)&vMan;
995             uint32_t *pDst = (uint32_t*)&vDst;
996             for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
997             {
998                 if (pDenormMask[i])
999                 {
1000                     // Need to compute subnormal value
1001                     uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
1002                     uint32_t mantissa = pMan[i] |
1003                                         (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s.  Make it explicit
1004 
1005                     pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1006                 }
1007             }
1008         }
1009 
1010         // Add in sign bits
1011         vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
1012 
1013         // Pack to lower 128-bits
1014         vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
1015 
1016 #if 0
1017 #if !defined(NDEBUG)
1018         simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
1019 
1020         for (uint32_t i = 0; i < 4; ++i)
1021         {
1022             SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
1023         }
1024 #endif
1025 #endif
1026 
1027         return _simd_castsi_ps(vDst);
1028 
1029 #else
1030         return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
1031 #endif
1032 #else
1033 #error Unsupported vector width
1034 #endif
1035     }
1036 
1037     static simdscalar unpack(const simdscalar &in)
1038     {
1039         // input is 8 packed float16, output is 8 packed float32
1040         SWR_ASSERT(0); // @todo
1041         return _simd_setzero_ps();
1042     }
1043 #if ENABLE_AVX512_SIMD16
1044 
1045     static simd16scalar pack(const simd16scalar &in)
1046     {
1047         simd16scalari result = _simd16_setzero_si();
1048         simdscalari resultlo = _simd_setzero_si();
1049 
1050 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1051         simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
1052         simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
1053 
1054         __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
1055         __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);
1056 
1057 #else
1058         __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
1059         __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
1060 
1061 #endif
1062         resultlo = _simd_insertf128_si(resultlo, templo, 0);
1063         resultlo = _simd_insertf128_si(resultlo, temphi, 1);
1064 
1065         result = _simd16_insert_si(result, resultlo, 0);
1066 
1067         return _simd16_castsi_ps(result);
1068     }
1069 
1070     static simd16scalar unpack(const simd16scalar &in)
1071     {
1072         // input is 16 packed float16, output is 16 packed float32
1073         SWR_ASSERT(0); // @todo
1074         return _simd16_setzero_ps();
1075     }
1076 #endif
1077 };
1078 
1079 //////////////////////////////////////////////////////////////////////////
1080 /// TypeTraits - Format type traits specialization for FLOAT32
1081 //////////////////////////////////////////////////////////////////////////
1082 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
1083 {
1084     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
1085     static float toFloat() { return 1.0f; }
1086     static float fromFloat() { return 1.0f; }
1087     static inline simdscalar convertSrgb(simdscalar &in)
1088     {
1089 #if KNOB_SIMD_WIDTH == 8
1090 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
1091         __m128 srcLo = _mm256_extractf128_ps(in, 0);
1092         __m128 srcHi = _mm256_extractf128_ps(in, 1);
1093 
1094         srcLo = ConvertFloatToSRGB2(srcLo);
1095         srcHi = ConvertFloatToSRGB2(srcHi);
1096 
1097         in = _mm256_insertf128_ps(in, srcLo, 0);
1098         in = _mm256_insertf128_ps(in, srcHi, 1);
1099 #endif
1100 #else
1101 #error Unsupported vector width
1102 #endif
1103         return in;
1104     }
1105 #if ENABLE_AVX512_SIMD16
1106 
1107     static inline simd16scalar convertSrgb(simd16scalar &in)
1108     {
1109         return ConvertFloatToSRGB2(in);
1110     }
1111 #endif
1112 };
1113 
//////////////////////////////////////////////////////////////////////////
/// Format1 - Bitfield for single component formats.
///
/// Only 'r' is a real component.  'g', 'b' and 'a' share its storage and
/// exist so that code written against the full r/g/b/a interface compiles.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x>
struct Format1
{
    union
    {
        uint32_t r : x;     ///< the single component, x bits wide

        ///@ Aliases of 'r', present to provide the full template needed in Formats.
        uint32_t g : x;
        uint32_t b : x;
        uint32_t a : x;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format1 - 8 bit specialization, stored in a single uint8_t
//////////////////////////////////////////////////////////////////////////
template<>
struct Format1<8>
{
    union
    {
        uint8_t r;          ///< the single component

        ///@ Aliases of 'r', present to provide the full template needed in Formats.
        uint8_t g;
        uint8_t b;
        uint8_t a;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format1 - 16 bit specialization, stored in a single uint16_t
//////////////////////////////////////////////////////////////////////////
template<>
struct Format1<16>
{
    union
    {
        uint16_t r;         ///< the single component

        ///@ Aliases of 'r', present to provide the full template needed in Formats.
        uint16_t g;
        uint16_t b;
        uint16_t a;
    };
};
1164 
//////////////////////////////////////////////////////////////////////////
/// Format2 - Bitfield for 2 component formats.
///
/// 'r' and 'g' are the real components.  'b' and 'a' overlay the same
/// storage ('b' on 'r', 'a' on 'g').
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y>
union Format2
{
    struct
    {
        uint32_t r : x;     ///< first component, x bits wide
        uint32_t g : y;     ///< second component, y bits wide
    };
    struct
    {
        ///@ Overlays of r/g, present to provide the full template needed in Formats.
        uint32_t b : x;
        uint32_t a : y;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format2 - 8:8 specialization, stored in 16 bit fields
//////////////////////////////////////////////////////////////////////////
template<>
union Format2<8,8>
{
    struct
    {
        uint16_t r : 8;     ///< first component
        uint16_t g : 8;     ///< second component
    };
    struct
    {
        ///@ Overlays of r/g, present to provide the full template needed in Formats.
        uint16_t b : 8;
        uint16_t a : 8;
    };
};
1202 
//////////////////////////////////////////////////////////////////////////
/// Format3 - Bitfield for 3 component formats.
///
/// 'r', 'g' and 'b' are the real components.  'a' overlays the whole
/// storage word.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z>
union Format3
{
    struct
    {
        uint32_t r : x;     ///< first component, x bits wide
        uint32_t g : y;     ///< second component, y bits wide
        uint32_t b : z;     ///< third component, z bits wide
    };
    uint32_t a;  ///@note Present only to provide the full template needed in Formats.
};

//////////////////////////////////////////////////////////////////////////
/// Format3 - 5:6:5 specialization, stored in 16 bit fields
//////////////////////////////////////////////////////////////////////////
template<>
union Format3<5,6,5>
{
    struct
    {
        uint16_t r : 5;     ///< first component
        uint16_t g : 6;     ///< second component
        uint16_t b : 5;     ///< third component
    };
    uint16_t a;  ///@note Present only to provide the full template needed in Formats.
};
1232 
//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats.
///
/// Each component gets its own bit field of the given width.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
struct Format4
{
    uint32_t r : x;     ///< first component, x bits wide
    uint32_t g : y;     ///< second component, y bits wide
    uint32_t b : z;     ///< third component, z bits wide
    uint32_t a : w;     ///< fourth component, w bits wide
};

//////////////////////////////////////////////////////////////////////////
/// Format4 - 5:5:5:1 specialization, stored in 16 bit fields
//////////////////////////////////////////////////////////////////////////
template<>
struct Format4<5,5,5,1>
{
    uint16_t r : 5;
    uint16_t g : 5;
    uint16_t b : 5;
    uint16_t a : 1;
};

//////////////////////////////////////////////////////////////////////////
/// Format4 - 4:4:4:4 specialization, stored in 16 bit fields
//////////////////////////////////////////////////////////////////////////
template<>
struct Format4<4,4,4,4>
{
    uint16_t r : 4;
    uint16_t g : 4;
    uint16_t b : 4;
    uint16_t a : 4;
};
1268 
1269 //////////////////////////////////////////////////////////////////////////
1270 /// ComponentTraits - Default components
1271 //////////////////////////////////////////////////////////////////////////
1272 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1273 struct Defaults
1274 {
1275     INLINE static uint32_t GetDefault(uint32_t comp)
1276     {
1277         static const uint32_t defaults[4]{ x, y, z, w };
1278         return defaults[comp];
1279     }
1280 };
1281 
1282 //////////////////////////////////////////////////////////////////////////
1283 /// ComponentTraits - Component type traits.
1284 //////////////////////////////////////////////////////////////////////////
1285 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
1286 struct ComponentTraits
1287 {
1288     INLINE static SWR_TYPE GetType(uint32_t comp)
1289     {
1290         static const SWR_TYPE CompType[4]{ X, Y, Z, W };
1291         return CompType[comp];
1292     }
1293 
1294     INLINE static uint32_t GetBPC(uint32_t comp)
1295     {
1296         static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
1297         return MyBpc[comp];
1298     }
1299 
1300     INLINE static bool isNormalized(uint32_t comp)
1301     {
1302         switch (comp)
1303         {
1304         case 0:
1305             return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
1306         case 1:
1307             return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
1308         case 2:
1309             return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
1310         case 3:
1311             return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
1312         }
1313         SWR_ASSERT(0);
1314         return false;
1315     }
1316 
1317     INLINE static float toFloat(uint32_t comp)
1318     {
1319         switch (comp)
1320         {
1321         case 0:
1322             return TypeTraits<X, NumBitsX>::toFloat();
1323         case 1:
1324             return TypeTraits<Y, NumBitsY>::toFloat();
1325         case 2:
1326             return TypeTraits<Z, NumBitsZ>::toFloat();
1327         case 3:
1328             return TypeTraits<W, NumBitsW>::toFloat();
1329         }
1330         SWR_ASSERT(0);
1331         return TypeTraits<X, NumBitsX>::toFloat();
1332 
1333     }
1334 
1335     INLINE static float fromFloat(uint32_t comp)
1336     {
1337         switch (comp)
1338         {
1339         case 0:
1340             return TypeTraits<X, NumBitsX>::fromFloat();
1341         case 1:
1342             return TypeTraits<Y, NumBitsY>::fromFloat();
1343         case 2:
1344             return TypeTraits<Z, NumBitsZ>::fromFloat();
1345         case 3:
1346             return TypeTraits<W, NumBitsW>::fromFloat();
1347         }
1348         SWR_ASSERT(0);
1349         return TypeTraits<X, NumBitsX>::fromFloat();
1350     }
1351 
1352     INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
1353     {
1354         switch (comp)
1355         {
1356         case 0:
1357             return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1358         case 1:
1359             return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
1360         case 2:
1361             return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
1362         case 3:
1363             return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1364         }
1365         SWR_ASSERT(0);
1366         return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1367     }
1368 
1369     INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
1370     {
1371         switch (comp)
1372         {
1373         case 0:
1374             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1375             return;
1376         case 1:
1377             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1378             return;
1379         case 2:
1380             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1381             return;
1382         case 3:
1383             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1384             return;
1385         }
1386         SWR_ASSERT(0);
1387         TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1388     }
1389 
1390     INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1391     {
1392         switch (comp)
1393         {
1394         case 0:
1395             return TypeTraits<X, NumBitsX>::unpack(in);
1396         case 1:
1397             return TypeTraits<Y, NumBitsY>::unpack(in);
1398         case 2:
1399             return TypeTraits<Z, NumBitsZ>::unpack(in);
1400         case 3:
1401             return TypeTraits<W, NumBitsW>::unpack(in);
1402         }
1403         SWR_ASSERT(0);
1404         return TypeTraits<X, NumBitsX>::unpack(in);
1405     }
1406 
1407     INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1408     {
1409         switch (comp)
1410         {
1411         case 0:
1412             return TypeTraits<X, NumBitsX>::pack(in);
1413         case 1:
1414             return TypeTraits<Y, NumBitsY>::pack(in);
1415         case 2:
1416             return TypeTraits<Z, NumBitsZ>::pack(in);
1417         case 3:
1418             return TypeTraits<W, NumBitsW>::pack(in);
1419         }
1420         SWR_ASSERT(0);
1421         return TypeTraits<X, NumBitsX>::pack(in);
1422     }
1423 
1424     INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1425     {
1426         switch (comp)
1427         {
1428         case 0:
1429             return TypeTraits<X, NumBitsX>::convertSrgb(in);
1430         case 1:
1431             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1432         case 2:
1433             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1434         case 3:
1435             return TypeTraits<W, NumBitsW>::convertSrgb(in);
1436         }
1437         SWR_ASSERT(0);
1438         return TypeTraits<X, NumBitsX>::convertSrgb(in);
1439     }
1440 #if ENABLE_AVX512_SIMD16
1441 
1442     INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
1443     {
1444         switch (comp)
1445         {
1446         case 0:
1447             return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1448         case 1:
1449             return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
1450         case 2:
1451             return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
1452         case 3:
1453             return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
1454         }
1455         SWR_ASSERT(0);
1456         return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1457     }
1458 
1459     INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
1460     {
1461         switch (comp)
1462         {
1463         case 0:
1464             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1465             return;
1466         case 1:
1467             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1468             return;
1469         case 2:
1470             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1471             return;
1472         case 3:
1473             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1474             return;
1475         }
1476         SWR_ASSERT(0);
1477         TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1478     }
1479 
1480     INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
1481     {
1482         switch (comp)
1483         {
1484         case 0:
1485             return TypeTraits<X, NumBitsX>::unpack(in);
1486         case 1:
1487             return TypeTraits<Y, NumBitsY>::unpack(in);
1488         case 2:
1489             return TypeTraits<Z, NumBitsZ>::unpack(in);
1490         case 3:
1491             return TypeTraits<W, NumBitsW>::unpack(in);
1492         }
1493         SWR_ASSERT(0);
1494         return TypeTraits<X, NumBitsX>::unpack(in);
1495     }
1496 
1497     INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
1498     {
1499         switch (comp)
1500         {
1501         case 0:
1502             return TypeTraits<X, NumBitsX>::pack(in);
1503         case 1:
1504             return TypeTraits<Y, NumBitsY>::pack(in);
1505         case 2:
1506             return TypeTraits<Z, NumBitsZ>::pack(in);
1507         case 3:
1508             return TypeTraits<W, NumBitsW>::pack(in);
1509         }
1510         SWR_ASSERT(0);
1511         return TypeTraits<X, NumBitsX>::pack(in);
1512     }
1513 
1514     INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
1515     {
1516         switch (comp)
1517         {
1518         case 0:
1519             return TypeTraits<X, NumBitsX>::convertSrgb(in);
1520         case 1:
1521             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1522         case 2:
1523             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1524         case 3:
1525             return TypeTraits<W, NumBitsW>::convertSrgb(in);
1526         }
1527         SWR_ASSERT(0);
1528         return TypeTraits<X, NumBitsX>::convertSrgb(in);
1529     }
1530 #endif
1531 };
1532