• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file formats.h
24 *
25 * @brief Definitions for SWR_FORMAT functions.
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include "utils.h"
31 #include "common/simdintrin.h"
32 
33 //////////////////////////////////////////////////////////////////////////
34 /// PackTraits - Helpers for packing / unpacking same pixel sizes
35 //////////////////////////////////////////////////////////////////////////
36 template <uint32_t NumBits, bool Signed = false>
37 struct PackTraits
38 {
39     static const uint32_t MyNumBits = NumBits;
40     static simdscalar loadSOA(const uint8_t *pSrc) = delete;
41     static void storeSOA(uint8_t *pDst, simdscalar const &src) = delete;
42     static simdscalar unpack(simdscalar &in) = delete;
43     static simdscalar pack(simdscalar &in) = delete;
44 #if ENABLE_AVX512_SIMD16
45     static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
46     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) = delete;
47     static simd16scalar unpack(simd16scalar &in) = delete;
48     static simd16scalar pack(simd16scalar &in) = delete;
49 #endif
50 };
51 
52 //////////////////////////////////////////////////////////////////////////
53 /// PackTraits - Helpers for packing / unpacking unused channels
54 //////////////////////////////////////////////////////////////////////////
55 template <>
56 struct PackTraits<0, false>
57 {
58     static const uint32_t MyNumBits = 0;
59 
60     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
61     static void storeSOA(uint8_t *pDst, simdscalar const &src) { return; }
62     static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
63     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
64 #if ENABLE_AVX512_SIMD16
65     static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
66     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) { return; }
67     static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
68     static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
69 #endif
70 };
71 
72 //////////////////////////////////////////////////////////////////////////
73 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
74 //////////////////////////////////////////////////////////////////////////
75 template <>
76 struct PackTraits<8, false>
77 {
78     static const uint32_t MyNumBits = 8;
79 
80     static simdscalar loadSOA(const uint8_t *pSrc)
81     {
82 #if KNOB_SIMD_WIDTH == 8
83         __m256 result = _mm256_setzero_ps();
84         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
85         return _mm256_insertf128_ps(result, vLo, 0);
86 #else
87 #error Unsupported vector width
88 #endif
89     }
90 
91     static void storeSOA(uint8_t *pDst, simdscalar const &src)
92     {
93         // store simd bytes
94 #if KNOB_SIMD_WIDTH == 8
95         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
96 #else
97 #error Unsupported vector width
98 #endif
99     }
100 
101     static simdscalar unpack(simdscalar &in)
102     {
103 #if KNOB_SIMD_WIDTH == 8
104 #if KNOB_ARCH <= KNOB_ARCH_AVX
105         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
106         __m128i resLo = _mm_cvtepu8_epi32(src);
107         __m128i resHi = _mm_shuffle_epi8(src,
108             _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
109 
110         __m256i result = _mm256_castsi128_si256(resLo);
111         result = _mm256_insertf128_si256(result, resHi, 1);
112         return simdscalar{ _mm256_castsi256_ps(result) };
113 #else
114         return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
115 #endif
116 #else
117 #error Unsupported vector width
118 #endif
119     }
120 
121     static simdscalar pack(simdscalar &in)
122     {
123 #if KNOB_SIMD_WIDTH == 8
124         simdscalari src = _simd_castps_si(in);
125         __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
126         __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
127         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
128 #else
129 #error Unsupported vector width
130 #endif
131     }
132 #if ENABLE_AVX512_SIMD16
133 
134     static simd16scalar loadSOA_16(const uint8_t *pSrc)
135     {
136         simd16scalar result = _simd16_setzero_ps();
137         simdscalar resultlo = _simd_setzero_ps();
138 
139         const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
140 
141         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
142         result = _simd16_insert_ps(result, resultlo, 0);
143 
144         return result;
145     }
146 
147     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
148     {
149         // store simd16 bytes
150         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
151     }
152 
153     static simd16scalar unpack(simd16scalar &in)
154     {
155         simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
156         simd16scalari result = _simd16_cvtepu8_epi32(tmp);
157 
158         return _simd16_castsi_ps(result);
159     }
160 
161     static simd16scalar pack(simd16scalar &in)
162     {
163         simd16scalari result = _simd16_setzero_si();
164 
165         simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));          // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
166         simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));          // r8 r9 rA rB rC rD rE rF
167 
168         simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);           // r0 r1 r2 r3 r8 r9 rA rB (32b)
169         simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);           // r4 r5 r6 r7 rC rD rE rF (32b)
170 
171         simdscalari pack = _simd_packus_epi32(permlo, permhi);                  // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
172 
173         const simdscalari zero = _simd_setzero_si();
174 
175         permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
176         permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
177 
178         pack = _simd_packus_epi16(permlo, permhi);                              // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
179 
180         result = _simd16_insert_si(result, pack, 0);
181 
182         return _simd16_castsi_ps(result);
183     }
184 #endif
185 };
186 
187 //////////////////////////////////////////////////////////////////////////
188 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
189 //////////////////////////////////////////////////////////////////////////
190 template <>
191 struct PackTraits<8, true>
192 {
193     static const uint32_t MyNumBits = 8;
194 
195     static simdscalar loadSOA(const uint8_t *pSrc)
196     {
197 #if KNOB_SIMD_WIDTH == 8
198         __m256 result = _mm256_setzero_ps();
199         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
200         return _mm256_insertf128_ps(result, vLo, 0);
201 #else
202 #error Unsupported vector width
203 #endif
204     }
205 
206     static void storeSOA(uint8_t *pDst, simdscalar const &src)
207     {
208         // store simd bytes
209 #if KNOB_SIMD_WIDTH == 8
210         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
211 #else
212 #error Unsupported vector width
213 #endif
214     }
215 
216     static simdscalar unpack(simdscalar &in)
217     {
218 #if KNOB_SIMD_WIDTH == 8
219 #if KNOB_ARCH <= KNOB_ARCH_AVX
220         SWR_INVALID("I think this may be incorrect.");
221         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
222         __m128i resLo = _mm_cvtepi8_epi32(src);
223         __m128i resHi = _mm_shuffle_epi8(src,
224             _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
225 
226         __m256i result = _mm256_castsi128_si256(resLo);
227         result = _mm256_insertf128_si256(result, resHi, 1);
228         return _mm256_castsi256_ps(result);
229 #else
230         return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
231 #endif
232 #else
233 #error Unsupported vector width
234 #endif
235     }
236 
237     static simdscalar pack(simdscalar &in)
238     {
239 #if KNOB_SIMD_WIDTH == 8
240         simdscalari src = _simd_castps_si(in);
241         __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
242         __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
243         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
244 #else
245 #error Unsupported vector width
246 #endif
247     }
248 #if ENABLE_AVX512_SIMD16
249 
250     static simd16scalar loadSOA_16(const uint8_t *pSrc)
251     {
252         simd16scalar result = _simd16_setzero_ps();
253         simdscalar resultlo = _simd_setzero_ps();
254 
255         const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
256 
257         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
258         result = _simd16_insert_ps(result, resultlo, 0);
259 
260         return result;
261     }
262 
263     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
264     {
265         // store simd16 bytes
266         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
267     }
268 
269     static simd16scalar unpack(simd16scalar &in)
270     {
271         simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
272         simd16scalari result = _simd16_cvtepu8_epi32(tmp);
273 
274         return _simd16_castsi_ps(result);
275     }
276 
277     static simd16scalar pack(simd16scalar &in)
278     {
279         simd16scalari result = _simd16_setzero_si();
280 
281         simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));          // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
282         simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));          // r8 r9 rA rB rC rD rE rF
283 
284         simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);           // r0 r1 r2 r3 r8 r9 rA rB (32b)
285         simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);           // r4 r5 r6 r7 rC rD rE rF (32b)
286 
287         simdscalari pack = _simd_packs_epi32(permlo, permhi);                   // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
288 
289         const simdscalari zero = _simd_setzero_si();
290 
291         permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
292         permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
293 
294         pack = _simd_packs_epi16(permlo, permhi);                               // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
295 
296         result = _simd16_insert_si(result, pack, 0);
297 
298         return _simd16_castsi_ps(result);
299     }
300 #endif
301 };
302 
303 //////////////////////////////////////////////////////////////////////////
304 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
305 //////////////////////////////////////////////////////////////////////////
306 template <>
307 struct PackTraits<16, false>
308 {
309     static const uint32_t MyNumBits = 16;
310 
311     static simdscalar loadSOA(const uint8_t *pSrc)
312     {
313 #if KNOB_SIMD_WIDTH == 8
314         __m256 result = _mm256_setzero_ps();
315         __m128 vLo = _mm_load_ps((const float*)pSrc);
316         return _mm256_insertf128_ps(result, vLo, 0);
317 #else
318 #error Unsupported vector width
319 #endif
320     }
321 
322     static void storeSOA(uint8_t *pDst, simdscalar const &src)
323     {
324 #if KNOB_SIMD_WIDTH == 8
325         // store 16B (2B * 8)
326         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
327 #else
328 #error Unsupported vector width
329 #endif
330     }
331 
332     static simdscalar unpack(simdscalar &in)
333     {
334 #if KNOB_SIMD_WIDTH == 8
335 #if KNOB_ARCH <= KNOB_ARCH_AVX
336         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
337         __m128i resLo = _mm_cvtepu16_epi32(src);
338         __m128i resHi = _mm_shuffle_epi8(src,
339             _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
340 
341         __m256i result = _mm256_castsi128_si256(resLo);
342         result = _mm256_insertf128_si256(result, resHi, 1);
343         return _mm256_castsi256_ps(result);
344 #else
345         return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
346 #endif
347 #else
348 #error Unsupported vector width
349 #endif
350     }
351 
352     static simdscalar pack(simdscalar &in)
353     {
354 #if KNOB_SIMD_WIDTH == 8
355         simdscalari src = _simd_castps_si(in);
356         __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
357         return _mm256_castsi256_ps(res);
358 #else
359 #error Unsupported vector width
360 #endif
361     }
362 #if ENABLE_AVX512_SIMD16
363 
364     static simd16scalar loadSOA_16(const uint8_t *pSrc)
365     {
366         simd16scalar result = _simd16_setzero_ps();
367 
368         simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
369 
370         result = _simd16_insert_ps(result, resultlo, 0);
371 
372         return result;
373     }
374 
375     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
376     {
377         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
378     }
379 
380     static simd16scalar unpack(simd16scalar &in)
381     {
382         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
383 
384         return _simd16_castsi_ps(result);
385     }
386 
387     static simd16scalar pack(simd16scalar &in)
388     {
389         const simd16scalari zero = _simd16_setzero_si();
390 
391         simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
392         simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
393 
394         simd16scalari result = _simd16_packus_epi32(permlo, permhi);    // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
395 
396         return _simd16_castsi_ps(result);
397     }
398 #endif
399 };
400 
401 //////////////////////////////////////////////////////////////////////////
402 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
403 //////////////////////////////////////////////////////////////////////////
404 template <>
405 struct PackTraits<16, true>
406 {
407     static const uint32_t MyNumBits = 16;
408 
409     static simdscalar loadSOA(const uint8_t *pSrc)
410     {
411 #if KNOB_SIMD_WIDTH == 8
412         __m256 result = _mm256_setzero_ps();
413         __m128 vLo = _mm_load_ps((const float*)pSrc);
414         return _mm256_insertf128_ps(result, vLo, 0);
415 #else
416 #error Unsupported vector width
417 #endif
418     }
419 
420     static void storeSOA(uint8_t *pDst, simdscalar const &src)
421     {
422 #if KNOB_SIMD_WIDTH == 8
423         // store 16B (2B * 8)
424         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
425 #else
426 #error Unsupported vector width
427 #endif
428     }
429 
430     static simdscalar unpack(simdscalar &in)
431     {
432 #if KNOB_SIMD_WIDTH == 8
433 #if KNOB_ARCH <= KNOB_ARCH_AVX
434         SWR_INVALID("I think this may be incorrect.");
435         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
436         __m128i resLo = _mm_cvtepi16_epi32(src);
437         __m128i resHi = _mm_shuffle_epi8(src,
438             _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
439 
440         __m256i result = _mm256_castsi128_si256(resLo);
441         result = _mm256_insertf128_si256(result, resHi, 1);
442         return _mm256_castsi256_ps(result);
443 #else
444         return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
445 #endif
446 #else
447 #error Unsupported vector width
448 #endif
449     }
450 
451     static simdscalar pack(simdscalar &in)
452     {
453 #if KNOB_SIMD_WIDTH == 8
454         simdscalari src = _simd_castps_si(in);
455         __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
456         return _mm256_castsi256_ps(res);
457 #else
458 #error Unsupported vector width
459 #endif
460     }
461 #if ENABLE_AVX512_SIMD16
462 
463     static simd16scalar loadSOA_16(const uint8_t *pSrc)
464     {
465         simd16scalar result = _simd16_setzero_ps();
466 
467         simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
468 
469         result = _simd16_insert_ps(result, resultlo, 0);
470 
471         return result;
472     }
473 
474     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
475     {
476         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
477     }
478 
479     static simd16scalar unpack(simd16scalar &in)
480     {
481         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
482 
483         return _simd16_castsi_ps(result);
484     }
485 
486     static simd16scalar pack(simd16scalar &in)
487     {
488         const simd16scalari zero = _simd16_setzero_si();
489 
490         simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
491         simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
492 
493         simd16scalari result = _simd16_packs_epi32(permlo, permhi);     // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
494 
495         return _simd16_castsi_ps(result);
496     }
497 #endif
498 };
499 
500 //////////////////////////////////////////////////////////////////////////
501 /// PackTraits - Helpers for packing / unpacking 32 bit channels
502 //////////////////////////////////////////////////////////////////////////
503 template <>
504 struct PackTraits<32, false>
505 {
506     static const uint32_t MyNumBits = 32;
507 
508     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
509     static void storeSOA(uint8_t *pDst, simdscalar const &src) { _simd_store_ps((float*)pDst, src); }
510     static simdscalar unpack(simdscalar &in) { return in; }
511     static simdscalar pack(simdscalar &in) { return in; }
512 #if ENABLE_AVX512_SIMD16
513 
514     static simd16scalar loadSOA_16(const uint8_t *pSrc)
515     {
516         return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
517     }
518 
519     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
520     {
521         _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
522     }
523 
524     static simd16scalar unpack(simd16scalar &in)
525     {
526         return in;
527     }
528 
529     static simd16scalar pack(simd16scalar &in)
530     {
531         return in;
532     }
533 #endif
534 };
535 
536 //////////////////////////////////////////////////////////////////////////
537 /// TypeTraits - Format type traits.
538 //////////////////////////////////////////////////////////////////////////
539 template<SWR_TYPE type, uint32_t NumBits>
540 struct TypeTraits : PackTraits<NumBits>
541 {
542     static const SWR_TYPE MyType = type;
543     static float toFloat() { return 0.0; }
544     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
545     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
546 };
547 
548 //////////////////////////////////////////////////////////////////////////
549 /// TypeTraits - Format type traits specialization for UINT8
550 //////////////////////////////////////////////////////////////////////////
551 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
552 {
553     static const SWR_TYPE MyType = SWR_TYPE_UINT;
554     static float toFloat() { return 0.0; }
555     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
556     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
557 };
558 
559 //////////////////////////////////////////////////////////////////////////
560 /// TypeTraits - Format type traits specialization for UINT8
561 //////////////////////////////////////////////////////////////////////////
562 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
563 {
564     static const SWR_TYPE MyType = SWR_TYPE_SINT;
565     static float toFloat() { return 0.0; }
566     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
567     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
568 };
569 
570 //////////////////////////////////////////////////////////////////////////
571 /// TypeTraits - Format type traits specialization for UINT16
572 //////////////////////////////////////////////////////////////////////////
573 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
574 {
575     static const SWR_TYPE MyType = SWR_TYPE_UINT;
576     static float toFloat() { return 0.0; }
577     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
578     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
579 };
580 
581 //////////////////////////////////////////////////////////////////////////
582 /// TypeTraits - Format type traits specialization for SINT16
583 //////////////////////////////////////////////////////////////////////////
584 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
585 {
586     static const SWR_TYPE MyType = SWR_TYPE_SINT;
587     static float toFloat() { return 0.0; }
588     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
589     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
590 };
591 
592 //////////////////////////////////////////////////////////////////////////
593 /// TypeTraits - Format type traits specialization for UINT32
594 //////////////////////////////////////////////////////////////////////////
595 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
596 {
597     static const SWR_TYPE MyType = SWR_TYPE_UINT;
598     static float toFloat() { return 0.0; }
599     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
600     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
601 };
602 
603 //////////////////////////////////////////////////////////////////////////
604 /// TypeTraits - Format type traits specialization for UINT32
605 //////////////////////////////////////////////////////////////////////////
606 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
607 {
608     static const SWR_TYPE MyType = SWR_TYPE_SINT;
609     static float toFloat() { return 0.0; }
610     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
611     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
612 };
613 
614 //////////////////////////////////////////////////////////////////////////
615 /// TypeTraits - Format type traits specialization for UNORM5
616 //////////////////////////////////////////////////////////////////////////
617 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
618 {
619     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
620     static float toFloat() { return 1.0f / 31.0f; }
621     static float fromFloat() { return 31.0f; }
622     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
623 };
624 
625 //////////////////////////////////////////////////////////////////////////
626 /// TypeTraits - Format type traits specialization for UNORM6
627 //////////////////////////////////////////////////////////////////////////
628 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
629 {
630     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
631     static float toFloat() { return 1.0f / 63.0f; }
632     static float fromFloat() { return 63.0f; }
633     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
634 };
635 
636 //////////////////////////////////////////////////////////////////////////
637 /// TypeTraits - Format type traits specialization for UNORM8
638 //////////////////////////////////////////////////////////////////////////
639 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
640 {
641     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
642     static float toFloat() { return 1.0f / 255.0f; }
643     static float fromFloat() { return 255.0f; }
644     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
645 };
646 
647 //////////////////////////////////////////////////////////////////////////
648 /// TypeTraits - Format type traits specialization for UNORM8
649 //////////////////////////////////////////////////////////////////////////
650 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
651 {
652     static const SWR_TYPE MyType = SWR_TYPE_SNORM;
653     static float toFloat() { return 1.0f / 127.0f; }
654     static float fromFloat() { return 127.0f; }
655     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
656 };
657 
658 //////////////////////////////////////////////////////////////////////////
659 /// TypeTraits - Format type traits specialization for UNORM16
660 //////////////////////////////////////////////////////////////////////////
661 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
662 {
663     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
664     static float toFloat() { return 1.0f / 65535.0f; }
665     static float fromFloat() { return 65535.0f; }
666     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
667 };
668 
669 //////////////////////////////////////////////////////////////////////////
670 /// TypeTraits - Format type traits specialization for SNORM16
671 //////////////////////////////////////////////////////////////////////////
672 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
673 {
674     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
675     static float toFloat() { return 1.0f / 32767.0f; }
676     static float fromFloat() { return 32767.0f; }
677     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
678 };
679 
680 //////////////////////////////////////////////////////////////////////////
681 /// TypeTraits - Format type traits specialization for UNORM24
682 //////////////////////////////////////////////////////////////////////////
683 template<>
684 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
685 {
686     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
687     static float toFloat() { return 1.0f / 16777215.0f; }
688     static float fromFloat() { return 16777215.0f; }
689     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
690 };
691 
692 //////////////////////////////////////////////////////////////////////////
693 // FLOAT Specializations from here on...
694 //////////////////////////////////////////////////////////////////////////
// Bit-preserving reinterpretation between the float (__m128) and integer
// (__m128i) views of a 128 bit register; no value conversion takes place.
#define TO_M128i(a) _mm_castps_si128((a))
#define TO_M128(a) _mm_castsi128_ps((a))
697 
698 #include "math.h"
699 
/// Approximates arg^(expnum / expden) on four floats at once.
///
/// The float bit pattern is read as an integer and used as a stand-in for
/// the logarithm: it is scaled by the exponent and the scaled integer is
/// then read back as a float. coeffnum / coeffden feeds a constant
/// correction factor that is multiplied in before the bit trick.
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static __m128 fastpow(__m128 arg) {
    // Constant pre-correction factor; it depends only on the template
    // arguments, so it is computed once per instantiation.
    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));

    const __m128 corrected = _mm_mul_ps(arg, factor);

    // Read the float bits as integers and convert those to float: this is
    // the "logarithm" of the input.
    const __m128 logarithm = _mm_cvtepi32_ps(_mm_castps_si128(corrected));

    // Multiplying the logarithm by the power raises the input to it.
    const __m128 scaled = _mm_mul_ps(logarithm, _mm_set1_ps(1.0f * expnum / expden));

    // Round back to integers and read those bits as floats to exponentiate.
    return _mm_castsi128_ps(_mm_cvtps_epi32(scaled));
}
723 
724 inline static __m128 pow512_4(__m128 arg) {
725     // 5/12 is too small, so compute the 4th root of 20/12 instead.
726     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
727     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
728     __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
729     __m128 xover = _mm_mul_ps(arg, xf);
730 
731     __m128 xfm1 = _mm_rsqrt_ps(xf);
732     __m128 x2 = _mm_mul_ps(arg, arg);
733     __m128 xunder = _mm_mul_ps(x2, xfm1);
734 
735     // sqrt2 * over + 2 * sqrt2 * under
736     __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
737         _mm_add_ps(xover, xunder));
738 
739     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
740     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
741     return xavg;
742 }
743 
/// Raises each of the four floats in Base to the power Exp using scalar
/// powf and returns the results in the same lane order.
inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    // Spill the lanes to a plain array instead of reading the vector
    // through a casted float pointer.
    float lanes[4];
    _mm_storeu_ps(lanes, Base);

    // _mm_set_ps takes its arguments from the highest lane to the lowest.
    return _mm_set_ps(powf(lanes[3], Exp),
                      powf(lanes[2], Exp),
                      powf(lanes[1], Exp),
                      powf(lanes[0], Exp));
}
753 
754 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
755 {
756     // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
757     __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
758 
759     // squeeze the mask down to 16 bits (4 bits per DWORD)
760     int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
761 
762     __m128 Result;
763 
764     //
765     if (CompareResult == 0xFFFF)
766     {
767         // all DWORDs are <= the threshold
768         Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
769     }
770     else if (CompareResult == 0x0)
771     {
772         // all DWORDs are > the threshold
773         __m128 fSrc_0RGB = Src;
774 
775         // --> 1.055f * c(1.0f/2.4f) - 0.055f
776 #if KNOB_USE_FAST_SRGB == TRUE
777         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
778         __m128 f = pow512_4(fSrc_0RGB);
779 #else
780         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
781 #endif
782         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
783         Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
784     }
785     else
786     {
787         // some DWORDs are <= the threshold and some are > threshold
788         __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
789 
790         __m128 fSrc_0RGB = Src;
791 
792         // --> 1.055f * c(1.0f/2.4f) - 0.055f
793 #if KNOB_USE_FAST_SRGB == TRUE
794         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
795         __m128 f = pow512_4(fSrc_0RGB);
796 #else
797         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
798 #endif
799         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
800         f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
801 
802         // Clear the alpha (is garbage after the sub)
803         __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
804 
805         __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
806         __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
807         __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
808 
809         Result = TO_M128(CombinedParts);
810     }
811 
812     return Result;
813 }
814 
815 #if ENABLE_AVX512_SIMD16
816 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
817 inline static simd16scalar SIMDCALL fastpow(simd16scalar const &value)
818 {
819     static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
820         * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
821 
822     // Apply a constant pre-correction factor.
823     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
824 
825     // Reinterpret arg as integer to obtain logarithm.
826     //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
827     result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
828 
829     // Multiply logarithm by power.
830     result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
831 
832     // Convert back to "integer" to exponentiate.
833     //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
834     result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
835 
836     return result;
837 }
838 
839 inline static simd16scalar SIMDCALL pow512_4(simd16scalar const &arg)
840 {
841     // 5/12 is too small, so compute the 4th root of 20/12 instead.
842     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
843     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
844     simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
845     simd16scalar xover = _simd16_mul_ps(arg, xf);
846 
847     simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
848     simd16scalar x2 = _simd16_mul_ps(arg, arg);
849     simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
850 
851     // sqrt2 * over + 2 * sqrt2 * under
852     simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));
853 
854     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
855     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
856 
857     return xavg;
858 }
859 
860 inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar &base, float exp)
861 {
862     const float *f = reinterpret_cast<const float *>(&base);
863 
864     return _simd16_set_ps(
865         powf(f[15], exp),
866         powf(f[14], exp),
867         powf(f[13], exp),
868         powf(f[12], exp),
869         powf(f[11], exp),
870         powf(f[10], exp),
871         powf(f[ 9], exp),
872         powf(f[ 8], exp),
873         powf(f[ 7], exp),
874         powf(f[ 6], exp),
875         powf(f[ 5], exp),
876         powf(f[ 4], exp),
877         powf(f[ 3], exp),
878         powf(f[ 2], exp),
879         powf(f[ 1], exp),
880         powf(f[ 0], exp)
881     );
882 }
883 
884 // float to SRGB conversion formula
885 //
886 // if (value < 0.0031308f)
887 //     value *= 12.92f;
888 // else
889 //     value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
890 //
891 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
892 {
893     // create a mask where the source is < the minimal SRGB float value
894     const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
895 
896     // if all elements are < the threshold, result = value * 12.92
897     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
898 
899     if (_simd16_mask2int(mask) != 0xFFFF)
900     {
901         // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
902 #if KNOB_USE_FAST_SRGB == TRUE
903         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
904         simd16scalar result2 = pow512_4(value);
905 #else
906         simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
907 #endif
908 
909         result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
910         result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
911 
912 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
913         // only native AVX512 can directly use the computed mask for the blend operation
914         result = _mm512_mask_blend_ps(mask, result2, result);
915 #else
916         result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
917 #endif
918     }
919 
920     return result;
921 }
922 
923 #endif
924 //////////////////////////////////////////////////////////////////////////
925 /// TypeTraits - Format type traits specialization for FLOAT16
926 //////////////////////////////////////////////////////////////////////////
927 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
928 {
929     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
930     static float toFloat() { return 1.0f; }
931     static float fromFloat() { return 1.0f; }
932     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
933 
934     static simdscalar pack(const simdscalar &in)
935     {
936 #if KNOB_SIMD_WIDTH == 8
937 #if (KNOB_ARCH == KNOB_ARCH_AVX)
938         // input is 8 packed float32, output is 8 packed float16
939         simdscalari src = _simd_castps_si(in);
940 
941         static const uint32_t FLOAT_EXP_BITS = 8;
942         static const uint32_t FLOAT_MANTISSA_BITS = 23;
943         static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
944         static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
945 
946         static const uint32_t HALF_EXP_BITS = 5;
947         static const uint32_t HALF_MANTISSA_BITS = 10;
948         static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
949 
950         // minimum exponent required, exponents below this are flushed to 0.
951         static const int32_t HALF_EXP_MIN = -14;
952         static const int32_t FLOAT_EXP_BIAS = 127;
953         static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
954         static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
955 
956         // maximum exponent required, exponents above this are set to infinity
957         static const int32_t HALF_EXP_MAX = 15;
958         static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
959 
960         const simdscalari vSignMask     = _simd_set1_epi32(0x80000000);
961         const simdscalari vExpMask      = _simd_set1_epi32(FLOAT_EXP_MASK);
962         const simdscalari vManMask      = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
963         const simdscalari vExpMin       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
964         const simdscalari vExpMinFtz    = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
965         const simdscalari vExpMax       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
966 
967         simdscalari vSign       = _simd_and_si(src, vSignMask);
968         simdscalari vExp        = _simd_and_si(src, vExpMask);
969         simdscalari vMan        = _simd_and_si(src, vManMask);
970 
971         simdscalari vFTZMask    = _simd_cmplt_epi32(vExp, vExpMinFtz);
972         simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
973         simdscalari vInfMask    = _simd_cmpeq_epi32(vExpMask, vExp);
974         simdscalari vClampMask  = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
975 
976         simdscalari vHalfExp    = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
977 
978         // pack output 16-bits into the lower 16-bits of each 32-bit channel
979         simdscalari vDst        = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
980         vDst   = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
981 
982         // Flush To Zero
983         vDst   = _simd_andnot_si(vFTZMask, vDst);
984         // Apply Infinites / NaN
985         vDst   = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
986 
987         // Apply clamps
988         vDst = _simd_andnot_si(vClampMask, vDst);
989         vDst = _simd_or_si(vDst,
990                 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
991 
992         // Compute Denormals (subnormals)
993         if (!_mm256_testz_si256(vDenormMask, vDenormMask))
994         {
995             uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
996             uint32_t *pExp = (uint32_t*)&vExp;
997             uint32_t *pMan = (uint32_t*)&vMan;
998             uint32_t *pDst = (uint32_t*)&vDst;
999             for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1000             {
1001                 if (pDenormMask[i])
1002                 {
1003                     // Need to compute subnormal value
1004                     uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
1005                     uint32_t mantissa = pMan[i] |
1006                                         (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s.  Make it explicit
1007 
1008                     pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1009                 }
1010             }
1011         }
1012 
1013         // Add in sign bits
1014         vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
1015 
1016         // Pack to lower 128-bits
1017         vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
1018 
1019 #if 0
1020 #if !defined(NDEBUG)
1021         simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
1022 
1023         for (uint32_t i = 0; i < 4; ++i)
1024         {
1025             SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
1026         }
1027 #endif
1028 #endif
1029 
1030         return _simd_castsi_ps(vDst);
1031 
1032 #else
1033         return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
1034 #endif
1035 #else
1036 #error Unsupported vector width
1037 #endif
1038     }
1039 
1040     static simdscalar unpack(const simdscalar &in)
1041     {
1042         // input is 8 packed float16, output is 8 packed float32
1043         SWR_NOT_IMPL; // @todo
1044         return _simd_setzero_ps();
1045     }
1046 #if ENABLE_AVX512_SIMD16
1047 
1048     static simd16scalar pack(const simd16scalar &in)
1049     {
1050         simd16scalari result = _simd16_setzero_si();
1051         simdscalari resultlo = _simd_setzero_si();
1052 
1053 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1054         simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
1055         simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
1056 
1057         __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
1058         __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);
1059 
1060 #else
1061         __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
1062         __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
1063 
1064 #endif
1065         resultlo = _simd_insertf128_si(resultlo, templo, 0);
1066         resultlo = _simd_insertf128_si(resultlo, temphi, 1);
1067 
1068         result = _simd16_insert_si(result, resultlo, 0);
1069 
1070         return _simd16_castsi_ps(result);
1071     }
1072 
1073     static simd16scalar unpack(const simd16scalar &in)
1074     {
1075         // input is 16 packed float16, output is 16 packed float32
1076         SWR_NOT_IMPL; //  @todo
1077         return _simd16_setzero_ps();
1078     }
1079 #endif
1080 };
1081 
1082 //////////////////////////////////////////////////////////////////////////
1083 /// TypeTraits - Format type traits specialization for FLOAT32
1084 //////////////////////////////////////////////////////////////////////////
1085 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
1086 {
1087     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
1088     static float toFloat() { return 1.0f; }
1089     static float fromFloat() { return 1.0f; }
1090     static inline simdscalar convertSrgb(simdscalar &in)
1091     {
1092 #if KNOB_SIMD_WIDTH == 8
1093         __m128 srcLo = _mm256_extractf128_ps(in, 0);
1094         __m128 srcHi = _mm256_extractf128_ps(in, 1);
1095 
1096         srcLo = ConvertFloatToSRGB2(srcLo);
1097         srcHi = ConvertFloatToSRGB2(srcHi);
1098 
1099         in = _mm256_insertf128_ps(in, srcLo, 0);
1100         in = _mm256_insertf128_ps(in, srcHi, 1);
1101 #else
1102 #error Unsupported vector width
1103 #endif
1104         return in;
1105     }
1106 #if ENABLE_AVX512_SIMD16
1107 
1108     static inline simd16scalar convertSrgb(simd16scalar &in)
1109     {
1110         return ConvertFloatToSRGB2(in);
1111     }
1112 #endif
1113 };
1114 
//////////////////////////////////////////////////////////////////////////
/// FormatIntType - Calculate base integer type for pixel components based
///                 on total number of bits.  Components can be smaller
///                 than this type, but the entire pixel must not be
///                 any smaller than this type.
///
///                 bits <= 8  -> uint8_t
///                 bits <= 16 -> uint16_t
///                 otherwise  -> uint32_t
//////////////////////////////////////////////////////////////////////////
template <uint32_t bits, bool bits8 = (bits <= 8), bool bits16 = (bits <= 16)>
struct FormatIntType
{
    using TYPE = uint32_t;
};

/// Pixels of at most 8 bits.
template <uint32_t bits>
struct FormatIntType<bits, true, true>
{
    using TYPE = uint8_t;
};

/// Pixels of 9 to 16 bits.
template <uint32_t bits>
struct FormatIntType<bits, false, true>
{
    using TYPE = uint16_t;
};
1138 
1139 //////////////////////////////////////////////////////////////////////////
1140 /// Format1 - Bitfield for single component formats.
1141 //////////////////////////////////////////////////////////////////////////
1142 template<uint32_t x>
1143 union Format1
1144 {
1145     typedef typename FormatIntType<x>::TYPE TYPE;
1146     struct
1147     {
1148         TYPE r : x;
1149     };
1150 
1151     ///@ The following are here to provide full template needed in Formats.
1152     struct
1153     {
1154         TYPE g : x;
1155     };
1156     struct
1157     {
1158         TYPE b : x;
1159     };
1160     struct
1161     {
1162         TYPE a : x;
1163     };
1164 };
1165 
1166 //////////////////////////////////////////////////////////////////////////
1167 /// Format2 - Bitfield for 2 component formats.
1168 //////////////////////////////////////////////////////////////////////////
1169 template<uint32_t x, uint32_t y>
1170 union Format2
1171 {
1172     typedef typename FormatIntType<x + y>::TYPE TYPE;
1173 
1174     struct
1175     {
1176         TYPE r : x;
1177         TYPE g : y;
1178     };
1179     struct
1180     {
1181         ///@ The following are here to provide full template needed in Formats.
1182         TYPE b : x;
1183         TYPE a : y;
1184     };
1185 };
1186 
1187 //////////////////////////////////////////////////////////////////////////
1188 /// Format3 - Bitfield for 3 component formats.
1189 //////////////////////////////////////////////////////////////////////////
1190 template<uint32_t x, uint32_t y, uint32_t z>
1191 union Format3
1192 {
1193     typedef typename FormatIntType<x + y + z>::TYPE TYPE;
1194 
1195     struct
1196     {
1197         TYPE r : x;
1198         TYPE g : y;
1199         TYPE b : z;
1200     };
1201     TYPE a;  ///@note This is here to provide full template needed in Formats.
1202 };
1203 
1204 //////////////////////////////////////////////////////////////////////////
1205 /// Format4 - Bitfield for 4 component formats.
1206 //////////////////////////////////////////////////////////////////////////
1207 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1208 struct Format4
1209 {
1210     typedef typename FormatIntType<x + y + z + w>::TYPE TYPE;
1211 
1212     TYPE r : x;
1213     TYPE g : y;
1214     TYPE b : z;
1215     TYPE a : w;
1216 };
1217 
1218 //////////////////////////////////////////////////////////////////////////
1219 /// ComponentTraits - Default components
1220 //////////////////////////////////////////////////////////////////////////
1221 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1222 struct Defaults
1223 {
1224     INLINE static uint32_t GetDefault(uint32_t comp)
1225     {
1226         static const uint32_t defaults[4]{ x, y, z, w };
1227         return defaults[comp];
1228     }
1229 };
1230 
1231 //////////////////////////////////////////////////////////////////////////
1232 /// ComponentTraits - Component type traits.
1233 //////////////////////////////////////////////////////////////////////////
1234 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
1235 struct ComponentTraits
1236 {
1237     INLINE static SWR_TYPE GetType(uint32_t comp)
1238     {
1239         static const SWR_TYPE CompType[4]{ X, Y, Z, W };
1240         return CompType[comp];
1241     }
1242 
1243     INLINE static constexpr uint32_t GetConstBPC(uint32_t comp)
1244     {
1245         return (comp == 3) ? NumBitsW :
1246             ((comp == 2) ? NumBitsZ :
1247                 ((comp == 1) ? NumBitsY : NumBitsX) );
1248     }
1249 
1250     INLINE static uint32_t GetBPC(uint32_t comp)
1251     {
1252         static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
1253         return MyBpc[comp];
1254     }
1255 
1256     INLINE static bool isNormalized(uint32_t comp)
1257     {
1258         switch (comp)
1259         {
1260         case 0:
1261             return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
1262         case 1:
1263             return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
1264         case 2:
1265             return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
1266         case 3:
1267             return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
1268         }
1269         SWR_INVALID("Invalid component: %d", comp);
1270         return false;
1271     }
1272 
1273     INLINE static float toFloat(uint32_t comp)
1274     {
1275         switch (comp)
1276         {
1277         case 0:
1278             return TypeTraits<X, NumBitsX>::toFloat();
1279         case 1:
1280             return TypeTraits<Y, NumBitsY>::toFloat();
1281         case 2:
1282             return TypeTraits<Z, NumBitsZ>::toFloat();
1283         case 3:
1284             return TypeTraits<W, NumBitsW>::toFloat();
1285         }
1286         SWR_INVALID("Invalid component: %d", comp);
1287         return TypeTraits<X, NumBitsX>::toFloat();
1288 
1289     }
1290 
1291     INLINE static float fromFloat(uint32_t comp)
1292     {
1293         switch (comp)
1294         {
1295         case 0:
1296             return TypeTraits<X, NumBitsX>::fromFloat();
1297         case 1:
1298             return TypeTraits<Y, NumBitsY>::fromFloat();
1299         case 2:
1300             return TypeTraits<Z, NumBitsZ>::fromFloat();
1301         case 3:
1302             return TypeTraits<W, NumBitsW>::fromFloat();
1303         }
1304         SWR_INVALID("Invalid component: %d", comp);
1305         return TypeTraits<X, NumBitsX>::fromFloat();
1306     }
1307 
1308     INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
1309     {
1310         switch (comp)
1311         {
1312         case 0:
1313             return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1314         case 1:
1315             return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
1316         case 2:
1317             return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
1318         case 3:
1319             return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1320         }
1321         SWR_INVALID("Invalid component: %d", comp);
1322         return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1323     }
1324 
1325     INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar const &src)
1326     {
1327         switch (comp)
1328         {
1329         case 0:
1330             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1331             return;
1332         case 1:
1333             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1334             return;
1335         case 2:
1336             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1337             return;
1338         case 3:
1339             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1340             return;
1341         }
1342         SWR_INVALID("Invalid component: %d", comp);
1343     }
1344 
1345     INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1346     {
1347         simdscalar out;
1348         switch (comp)
1349         {
1350         case 0:
1351             out = TypeTraits<X, NumBitsX>::unpack(in); break;
1352         case 1:
1353             out = TypeTraits<Y, NumBitsY>::unpack(in); break;
1354         case 2:
1355             out = TypeTraits<Z, NumBitsZ>::unpack(in); break;
1356         case 3:
1357             out = TypeTraits<W, NumBitsW>::unpack(in); break;
1358         default:
1359             SWR_INVALID("Invalid component: %d", comp);
1360             out = in;
1361             break;
1362         }
1363         return out;
1364     }
1365 
1366     INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1367     {
1368         simdscalar out;
1369         switch (comp)
1370         {
1371         case 0:
1372             out = TypeTraits<X, NumBitsX>::pack(in); break;
1373         case 1:
1374             out = TypeTraits<Y, NumBitsY>::pack(in); break;
1375         case 2:
1376             out = TypeTraits<Z, NumBitsZ>::pack(in); break;
1377         case 3:
1378             out = TypeTraits<W, NumBitsW>::pack(in); break;
1379         default:
1380             SWR_INVALID("Invalid component: %d", comp);
1381             out = in;
1382             break;
1383         }
1384         return out;
1385     }
1386 
1387     INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1388     {
1389         switch (comp)
1390         {
1391         case 0:
1392             return TypeTraits<X, NumBitsX>::convertSrgb(in);
1393         case 1:
1394             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1395         case 2:
1396             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1397         case 3:
1398             return TypeTraits<W, NumBitsW>::convertSrgb(in);
1399         }
1400         SWR_INVALID("Invalid component: %d", comp);
1401         return TypeTraits<X, NumBitsX>::convertSrgb(in);
1402     }
1403 #if ENABLE_AVX512_SIMD16
1404 
1405     INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
1406     {
1407         switch (comp)
1408         {
1409         case 0:
1410             return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1411         case 1:
1412             return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
1413         case 2:
1414             return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
1415         case 3:
1416             return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
1417         }
1418         SWR_INVALID("Invalid component: %d", comp);
1419         return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1420     }
1421 
1422     INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar const &src)
1423     {
1424         switch (comp)
1425         {
1426         case 0:
1427             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1428             return;
1429         case 1:
1430             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1431             return;
1432         case 2:
1433             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1434             return;
1435         case 3:
1436             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1437             return;
1438         }
1439         SWR_INVALID("Invalid component: %d", comp);
1440         TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1441     }
1442 
1443     INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
1444     {
1445         switch (comp)
1446         {
1447         case 0:
1448             return TypeTraits<X, NumBitsX>::unpack(in);
1449         case 1:
1450             return TypeTraits<Y, NumBitsY>::unpack(in);
1451         case 2:
1452             return TypeTraits<Z, NumBitsZ>::unpack(in);
1453         case 3:
1454             return TypeTraits<W, NumBitsW>::unpack(in);
1455         }
1456         SWR_INVALID("Invalid component: %d", comp);
1457         return TypeTraits<X, NumBitsX>::unpack(in);
1458     }
1459 
1460     INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
1461     {
1462         switch (comp)
1463         {
1464         case 0:
1465             return TypeTraits<X, NumBitsX>::pack(in);
1466         case 1:
1467             return TypeTraits<Y, NumBitsY>::pack(in);
1468         case 2:
1469             return TypeTraits<Z, NumBitsZ>::pack(in);
1470         case 3:
1471             return TypeTraits<W, NumBitsW>::pack(in);
1472         }
1473         SWR_INVALID("Invalid component: %d", comp);
1474         return TypeTraits<X, NumBitsX>::pack(in);
1475     }
1476 
1477     INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
1478     {
1479         switch (comp)
1480         {
1481         case 0:
1482             return TypeTraits<X, NumBitsX>::convertSrgb(in);
1483         case 1:
1484             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1485         case 2:
1486             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1487         case 3:
1488             return TypeTraits<W, NumBitsW>::convertSrgb(in);
1489         }
1490         SWR_INVALID("Invalid component: %d", comp);
1491         return TypeTraits<X, NumBitsX>::convertSrgb(in);
1492     }
1493 #endif
1494 };
1495