/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file formats.h
*
* @brief Definitions for SWR_FORMAT functions.
*
******************************************************************************/
#pragma once

#include "utils.h"

//////////////////////////////////////////////////////////////////////////
/// PackTraits - Helpers for packing / unpacking same pixel sizes
//////////////////////////////////////////////////////////////////////////
template <uint32_t NumBits, bool Signed = false>
struct PackTraits
{
    static const uint32_t MyNumBits = NumBits;
    static simdscalar loadSOA(const uint8_t *pSrc) = delete;
    static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
    static simdscalar unpack(simdscalar &in) = delete;
    static simdscalar pack(simdscalar &in) = delete;
#if ENABLE_AVX512_SIMD16
    static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
    static void storeSOA(uint8_t *pDst, simd16scalar src) = delete;
    static simd16scalar unpack(simd16scalar &in) = delete;
    static simd16scalar pack(simd16scalar &in) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// PackTraits - Helpers for packing / unpacking unused channels
//////////////////////////////////////////////////////////////////////////
template <>
struct PackTraits<0, false>
{
    static const uint32_t MyNumBits = 0;

    static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
    static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
    static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
    static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
#if ENABLE_AVX512_SIMD16
    static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
    static void storeSOA(uint8_t *pDst, simd16scalar src) { return; }
    static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
    static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
//////////////////////////////////////////////////////////////////////////
template <>
struct PackTraits<8, false>
{
    static const uint32_t MyNumBits = 8;

    static simdscalar loadSOA(const uint8_t *pSrc)
    {
#if KNOB_SIMD_WIDTH == 8
        __m256 result = _mm256_setzero_ps();
        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
        return _mm256_insertf128_ps(result, vLo, 0);
#else
#error Unsupported vector width
#endif
    }

    static void storeSOA(uint8_t *pDst, simdscalar src)
    {
        // store simd bytes
#if KNOB_SIMD_WIDTH == 8
        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
#else
#error Unsupported vector width
#endif
    }

    static simdscalar unpack(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH==KNOB_ARCH_AVX
        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
        __m128i resLo = _mm_cvtepu8_epi32(src);
        __m128i resHi = _mm_shuffle_epi8(src,
            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));

        __m256i result = _mm256_castsi128_si256(resLo);
        result = _mm256_insertf128_si256(result, resHi, 1);
        return _mm256_castsi256_ps(result);
#elif KNOB_ARCH>=KNOB_ARCH_AVX2
        return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
#error Unsupported vector width
#endif
    }

    static simdscalar pack(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_castps_si(in);
        __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
        __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    static simd16scalar loadSOA_16(const uint8_t *pSrc)
    {
        simd16scalar result = _simd16_setzero_ps();
        simdscalar resultlo = _simd_setzero_ps();

        const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));

        resultlo = _mm256_insertf128_ps(resultlo, src, 0);
        result = _simd16_insert_ps(result, resultlo, 0);

        return result;
    }

    static void storeSOA(uint8_t *pDst, simd16scalar src)
    {
        // store simd16 bytes
        _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
    }

    static simd16scalar unpack(simd16scalar &in)
    {
        simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));

        return _simd16_castsi_ps(result);
    }

    static simd16scalar pack(simd16scalar &in)
    {
        simd16scalari result = _simd16_setzero_si();

        simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));  // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
        simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));  // r8 r9 rA rB rC rD rE rF

        simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);   // r0 r1 r2 r3 r8 r9 rA rB (32b)
        simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);   // r4 r5 r6 r7 rC rD rE rF (32b)

        simdscalari pack = _simd_packus_epi32(permlo, permhi);          // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)

        const simdscalari zero = _simd_setzero_si();

        permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)   // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
        permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)   // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)

        pack = _simd_packus_epi16(permlo, permhi);          // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)

        result = _simd16_insert_si(result, pack, 0);

        return _simd16_castsi_ps(result);
    }
#endif
};
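
#if 0
// Illustrative sketch (hypothetical helper, not part of the original header):
// the intended call pattern for these traits. Raw 8-bit channel bytes are
// loaded SOA-style, widened to one 32-bit lane per pixel, and the caller then
// normalizes them (e.g. with the TypeTraits scale factors further down).
// Assumes the usual SWR _simd_* wrappers (_simd_cvtepi32_ps, _simd_mul_ps, ...).
static inline simdscalar LoadU8ChannelAsFloat_Example(const uint8_t *pSrc)
{
    simdscalar raw  = PackTraits<8>::loadSOA(pSrc);     // 8 bytes in the low 64 bits of the register
    simdscalar wide = PackTraits<8>::unpack(raw);       // zero-extended to 8 x 32-bit integers
    return _simd_mul_ps(_simd_cvtepi32_ps(_simd_castps_si(wide)),
                        _simd_set1_ps(1.0f / 255.0f));  // scale to [0, 1]
}
#endif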

//////////////////////////////////////////////////////////////////////////
/// PackTraits - Helpers for packing / unpacking 8 bit signed channels
//////////////////////////////////////////////////////////////////////////
template <>
struct PackTraits<8, true>
{
    static const uint32_t MyNumBits = 8;

    static simdscalar loadSOA(const uint8_t *pSrc)
    {
#if KNOB_SIMD_WIDTH == 8
        __m256 result = _mm256_setzero_ps();
        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
        return _mm256_insertf128_ps(result, vLo, 0);
#else
#error Unsupported vector width
#endif
    }

    static void storeSOA(uint8_t *pDst, simdscalar src)
    {
        // store simd bytes
#if KNOB_SIMD_WIDTH == 8
        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
#else
#error Unsupported vector width
#endif
    }

    static simdscalar unpack(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH==KNOB_ARCH_AVX
        SWR_ASSERT(0); // I think this may be incorrect.
        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
        __m128i resLo = _mm_cvtepi8_epi32(src);
        __m128i resHi = _mm_shuffle_epi8(src,
            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));

        __m256i result = _mm256_castsi128_si256(resLo);
        result = _mm256_insertf128_si256(result, resHi, 1);
        return _mm256_castsi256_ps(result);
#elif KNOB_ARCH>=KNOB_ARCH_AVX2
        return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
#error Unsupported vector width
#endif
    }

    static simdscalar pack(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_castps_si(in);
        __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
        __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    static simd16scalar loadSOA_16(const uint8_t *pSrc)
    {
        simd16scalar result = _simd16_setzero_ps();
        simdscalar resultlo = _simd_setzero_ps();

        const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));

        resultlo = _mm256_insertf128_ps(resultlo, src, 0);
        result = _simd16_insert_ps(result, resultlo, 0);

        return result;
    }

    static void storeSOA(uint8_t *pDst, simd16scalar src)
    {
        // store simd16 bytes
        _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
    }

    static simd16scalar unpack(simd16scalar &in)
    {
        simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));

        return _simd16_castsi_ps(result);
    }

    static simd16scalar pack(simd16scalar &in)
    {
        simd16scalari result = _simd16_setzero_si();

        simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));  // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
        simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));  // r8 r9 rA rB rC rD rE rF

        simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);   // r0 r1 r2 r3 r8 r9 rA rB (32b)
        simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);   // r4 r5 r6 r7 rC rD rE rF (32b)

        simdscalari pack = _simd_packs_epi32(permlo, permhi);           // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)

        const simdscalari zero = _simd_setzero_si();

        permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)   // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
        permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)   // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)

        pack = _simd_packs_epi16(permlo, permhi);           // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)

        result = _simd16_insert_si(result, pack, 0);

        return _simd16_castsi_ps(result);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
//////////////////////////////////////////////////////////////////////////
template <>
struct PackTraits<16, false>
{
    static const uint32_t MyNumBits = 16;

    static simdscalar loadSOA(const uint8_t *pSrc)
    {
#if KNOB_SIMD_WIDTH == 8
        __m256 result = _mm256_setzero_ps();
        __m128 vLo = _mm_load_ps((const float*)pSrc);
        return _mm256_insertf128_ps(result, vLo, 0);
#else
#error Unsupported vector width
#endif
    }

    static void storeSOA(uint8_t *pDst, simdscalar src)
    {
#if KNOB_SIMD_WIDTH == 8
        // store 16B (2B * 8)
        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
#else
#error Unsupported vector width
#endif
    }

    static simdscalar unpack(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH==KNOB_ARCH_AVX
        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
        __m128i resLo = _mm_cvtepu16_epi32(src);
        __m128i resHi = _mm_shuffle_epi8(src,
            _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));

        __m256i result = _mm256_castsi128_si256(resLo);
        result = _mm256_insertf128_si256(result, resHi, 1);
        return _mm256_castsi256_ps(result);
#elif KNOB_ARCH>=KNOB_ARCH_AVX2
        return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
#error Unsupported vector width
#endif
    }

    static simdscalar pack(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_castps_si(in);
        __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
        return _mm256_castsi256_ps(res);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    static simd16scalar loadSOA_16(const uint8_t *pSrc)
    {
        simd16scalar result = _simd16_setzero_ps();

        simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));

        result = _simd16_insert_ps(result, resultlo, 0);

        return result;
    }

    static void storeSOA(uint8_t *pDst, simd16scalar src)
    {
        _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
    }

    static simd16scalar unpack(simd16scalar &in)
    {
        simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));

        return _simd16_castsi_ps(result);
    }

    static simd16scalar pack(simd16scalar &in)
    {
        const simd16scalari zero = _simd16_setzero_si();

        simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
        simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00

        simd16scalari result = _simd16_packus_epi32(permlo, permhi);    // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)

        return _simd16_castsi_ps(result);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// PackTraits - Helpers for packing / unpacking 16 bit signed channels
//////////////////////////////////////////////////////////////////////////
template <>
struct PackTraits<16, true>
{
    static const uint32_t MyNumBits = 16;

    static simdscalar loadSOA(const uint8_t *pSrc)
    {
#if KNOB_SIMD_WIDTH == 8
        __m256 result = _mm256_setzero_ps();
        __m128 vLo = _mm_load_ps((const float*)pSrc);
        return _mm256_insertf128_ps(result, vLo, 0);
#else
#error Unsupported vector width
#endif
    }

    static void storeSOA(uint8_t *pDst, simdscalar src)
    {
#if KNOB_SIMD_WIDTH == 8
        // store 16B (2B * 8)
        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
#else
#error Unsupported vector width
#endif
    }

    static simdscalar unpack(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH==KNOB_ARCH_AVX
        SWR_ASSERT(0); // I think this is incorrectly implemented
        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
        __m128i resLo = _mm_cvtepi16_epi32(src);
        __m128i resHi = _mm_shuffle_epi8(src,
            _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));

        __m256i result = _mm256_castsi128_si256(resLo);
        result = _mm256_insertf128_si256(result, resHi, 1);
        return _mm256_castsi256_ps(result);
#elif KNOB_ARCH>=KNOB_ARCH_AVX2
        return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
#error Unsupported vector width
#endif
    }

    static simdscalar pack(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_castps_si(in);
        __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
        return _mm256_castsi256_ps(res);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    static simd16scalar loadSOA_16(const uint8_t *pSrc)
    {
        simd16scalar result = _simd16_setzero_ps();

        simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));

        result = _simd16_insert_ps(result, resultlo, 0);

        return result;
    }

    static void storeSOA(uint8_t *pDst, simd16scalar src)
    {
        _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
    }

    static simd16scalar unpack(simd16scalar &in)
    {
        simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));

        return _simd16_castsi_ps(result);
    }

    static simd16scalar pack(simd16scalar &in)
    {
        const simd16scalari zero = _simd16_setzero_si();

        simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
        simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00

        simd16scalari result = _simd16_packs_epi32(permlo, permhi);     // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)

        return _simd16_castsi_ps(result);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// PackTraits - Helpers for packing / unpacking 32 bit channels
//////////////////////////////////////////////////////////////////////////
template <>
struct PackTraits<32, false>
{
    static const uint32_t MyNumBits = 32;

    static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
    static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
    static simdscalar unpack(simdscalar &in) { return in; }
    static simdscalar pack(simdscalar &in) { return in; }
#if ENABLE_AVX512_SIMD16

    static simd16scalar loadSOA_16(const uint8_t *pSrc)
    {
        return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
    }

    static void storeSOA(uint8_t *pDst, simd16scalar src)
    {
        _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
    }

    static simd16scalar unpack(simd16scalar &in)
    {
        return in;
    }

    static simd16scalar pack(simd16scalar &in)
    {
        return in;
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits.
//////////////////////////////////////////////////////////////////////////
template<SWR_TYPE type, uint32_t NumBits>
struct TypeTraits : PackTraits<NumBits>
{
    static const SWR_TYPE MyType = type;
    static float toFloat() { return 0.0; }
    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UINT8
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
{
    static const SWR_TYPE MyType = SWR_TYPE_UINT;
    static float toFloat() { return 0.0; }
    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT8
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
{
    static const SWR_TYPE MyType = SWR_TYPE_SINT;
    static float toFloat() { return 0.0; }
    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UINT16
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
{
    static const SWR_TYPE MyType = SWR_TYPE_UINT;
    static float toFloat() { return 0.0; }
    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT16
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
{
    static const SWR_TYPE MyType = SWR_TYPE_SINT;
    static float toFloat() { return 0.0; }
    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UINT32
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
{
    static const SWR_TYPE MyType = SWR_TYPE_UINT;
    static float toFloat() { return 0.0; }
    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT32
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
{
    static const SWR_TYPE MyType = SWR_TYPE_SINT;
    static float toFloat() { return 0.0; }
    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM5
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    static float toFloat() { return 1.0f / 31.0f; }
    static float fromFloat() { return 31.0f; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM6
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    static float toFloat() { return 1.0f / 63.0f; }
    static float fromFloat() { return 63.0f; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM8
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    static float toFloat() { return 1.0f / 255.0f; }
    static float fromFloat() { return 255.0f; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

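#if 0
// Worked example (illustrative, not part of the original header): for UNORM8 the
// scale factors above mean a stored byte of 128 becomes 128 * toFloat() = 128 / 255
// ~= 0.502f when unpacked, and 0.502f * fromFloat() = 0.502f * 255.0f rounds back
// to 128 when packed. The other UNORM/SNORM widths follow the same pattern with
// their respective maximum values (31, 63, 127, 255, 32767, 65535, 16777215).
static inline float Unorm8ToFloat_Example(uint32_t bits)    // hypothetical helper
{
    return bits * TypeTraits<SWR_TYPE_UNORM, 8>::toFloat(); // 128 -> ~0.502f
}
#endif
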
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM8
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
{
    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
    static float toFloat() { return 1.0f / 127.0f; }
    static float fromFloat() { return 127.0f; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM16
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    static float toFloat() { return 1.0f / 65535.0f; }
    static float fromFloat() { return 65535.0f; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM16
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
{
    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
    static float toFloat() { return 1.0f / 32767.0f; }
    static float fromFloat() { return 32767.0f; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM24
//////////////////////////////////////////////////////////////////////////
template<>
struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    static float toFloat() { return 1.0f / 16777215.0f; }
    static float fromFloat() { return 16777215.0f; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
};

//////////////////////////////////////////////////////////////////////////
// FLOAT Specializations from here on...
//////////////////////////////////////////////////////////////////////////
#define TO_M128i(a) _mm_castps_si128(a)
#define TO_M128(a) _mm_castsi128_ps(a)

#include "math.h"

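// The fastpow() helpers below rely on the classic IEEE-754 bit trick: for a
// positive float x, its bit pattern read as an integer is roughly
//
//     bits(x) ~= 2^23 * (log2(x) + 127)
//
// so converting bits(x) to a float, scaling it by expnum/expden, and converting
// the result back into a bit pattern approximates x^(expnum/expden). The
// pre-correction factor built from coeffnum/coeffden and the exponent bias
// compensates for the 127 offset and folds a constant coefficient into the
// estimate; pow512_4() then refines the approximation for the sRGB exponent.
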
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static __m128 fastpow(__m128 arg) {
    __m128 ret = arg;

    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));

    // Apply a constant pre-correction factor.
    ret = _mm_mul_ps(ret, factor);

    // Reinterpret arg as integer to obtain logarithm.
    //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
    ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));

    // Multiply logarithm by power.
    ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));

    // Convert back to "integer" to exponentiate.
    //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
    ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));

    return ret;
}

inline static __m128 pow512_4(__m128 arg) {
    // 5/12 is too small, so compute the 4th root of 20/12 instead.
    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
    // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
    __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
    __m128 xover = _mm_mul_ps(arg, xf);

    __m128 xfm1 = _mm_rsqrt_ps(xf);
    __m128 x2 = _mm_mul_ps(arg, arg);
    __m128 xunder = _mm_mul_ps(x2, xfm1);

    // sqrt2 * over + 2 * sqrt2 * under
    __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
        _mm_add_ps(xover, xunder));

    xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
    xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
    return xavg;
}

inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    float *f = (float *)(&Base);

    return _mm_set_ps(powf(f[3], Exp),
        powf(f[2], Exp),
        powf(f[1], Exp),
        powf(f[0], Exp));
}

static inline __m128 ConvertFloatToSRGB2(__m128& Src)
{
    // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
    __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));

    // squeeze the mask down to 16 bits (4 bits per DWORD)
    int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);

    __m128 Result;

    if (CompareResult == 0xFFFF)
    {
        // all DWORDs are <= the threshold
        Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
    }
    else if (CompareResult == 0x0)
    {
        // all DWORDs are > the threshold
        __m128 fSrc_0RGB = Src;

        // --> 1.055f * c(1.0f/2.4f) - 0.055f
#if KNOB_USE_FAST_SRGB == TRUE
        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
        __m128 f = pow512_4(fSrc_0RGB);
#else
        __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
#endif
        f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
        Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
    }
    else
    {
        // some DWORDs are <= the threshold and some are > threshold
        __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));

        __m128 fSrc_0RGB = Src;

        // --> 1.055f * c(1.0f/2.4f) - 0.055f
#if KNOB_USE_FAST_SRGB == TRUE
        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
        __m128 f = pow512_4(fSrc_0RGB);
#else
        __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
#endif
        f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
        f = _mm_sub_ps(f, _mm_set1_ps(0.055f));

        // Clear the alpha (is garbage after the sub)
        __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));

        __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
        __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
        __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);

        Result = TO_M128(CombinedParts);
    }

    return Result;
}

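// Scalar reference for the piecewise sRGB encode used above, shown for clarity
// (an illustrative sketch only; the SIMD paths are what the rasterizer uses).
// Assumes linear input in [0, 1].
static inline float ConvertFloatToSRGBScalar_Example(float linear)  // hypothetical helper
{
    if (linear <= 0.0031308f)
    {
        return linear * 12.92f;                          // linear segment near zero
    }
    return 1.055f * powf(linear, 1.0f / 2.4f) - 0.055f;  // gamma segment
}
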
#if ENABLE_AVX512_SIMD16
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static simd16scalar fastpow(simd16scalar value)
{
    static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);

    // Apply a constant pre-correction factor.
    simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));

    // Reinterpret arg as integer to obtain logarithm.
    //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
    result = _simd16_cvtepi32_ps(_simd16_castps_si(result));

    // Multiply logarithm by power.
    result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));

    // Convert back to "integer" to exponentiate.
    //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
    result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));

    return result;
}

inline static simd16scalar pow512_4(simd16scalar arg)
{
    // 5/12 is too small, so compute the 4th root of 20/12 instead.
    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
    // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
    simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
    simd16scalar xover = _simd16_mul_ps(arg, xf);

    simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
    simd16scalar x2 = _simd16_mul_ps(arg, arg);
    simd16scalar xunder = _simd16_mul_ps(x2, xfm1);

    // sqrt2 * over + 2 * sqrt2 * under
    simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));

    xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
    xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));

    return xavg;
}

inline static simd16scalar powf_wrapper(const simd16scalar base, float exp)
{
    const float *f = reinterpret_cast<const float *>(&base);

    return _simd16_set_ps(
        powf(f[15], exp),
        powf(f[14], exp),
        powf(f[13], exp),
        powf(f[12], exp),
        powf(f[11], exp),
        powf(f[10], exp),
        powf(f[ 9], exp),
        powf(f[ 8], exp),
        powf(f[ 7], exp),
        powf(f[ 6], exp),
        powf(f[ 5], exp),
        powf(f[ 4], exp),
        powf(f[ 3], exp),
        powf(f[ 2], exp),
        powf(f[ 1], exp),
        powf(f[ 0], exp)
        );
}

// float to SRGB conversion formula
//
// if (value < 0.0031308f)
//     value *= 12.92f;
// else
//     value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
//
static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
{
    // create a mask where the source is < the minimal SRGB float value
    const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));

    // if all elements are < the threshold, result = value * 12.92
    simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));

    if (_simd16_mask2int(mask) != 0xFFFF)
    {
        // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
#if KNOB_USE_FAST_SRGB == TRUE
        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
        simd16scalar result2 = pow512_4(value);
#else
        simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
#endif

        result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
        result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));

#if (KNOB_ARCH == KNOB_ARCH_AVX512)
        // only native AVX512 can directly use the computed mask for the blend operation
        result = _mm512_mask_blend_ps(mask, result2, result);
#else
        result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
#endif
    }

    return result;
}

#endif
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for FLOAT16
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
{
    static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
    static float toFloat() { return 1.0f; }
    static float fromFloat() { return 1.0f; }
    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }

    static simdscalar pack(const simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
#if (KNOB_ARCH == KNOB_ARCH_AVX)
        // input is 8 packed float32, output is 8 packed float16
        simdscalari src = _simd_castps_si(in);

        static const uint32_t FLOAT_EXP_BITS = 8;
        static const uint32_t FLOAT_MANTISSA_BITS = 23;
        static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
        static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;

        static const uint32_t HALF_EXP_BITS = 5;
        static const uint32_t HALF_MANTISSA_BITS = 10;
        static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;

        // minimum exponent required, exponents below this are flushed to 0.
        static const int32_t HALF_EXP_MIN = -14;
        static const int32_t FLOAT_EXP_BIAS = 127;
        static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
        static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand

        // maximum exponent required, exponents above this are set to infinity
        static const int32_t HALF_EXP_MAX = 15;
        static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;

        const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
        const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK);
        const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
        const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
        const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
        const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));

        simdscalari vSign = _simd_and_si(src, vSignMask);
        simdscalari vExp = _simd_and_si(src, vExpMask);
        simdscalari vMan = _simd_and_si(src, vManMask);

        simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz);
        simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
        simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp);
        simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));

        simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));

        // pack output 16-bits into the lower 16-bits of each 32-bit channel
        simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
        vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));

        // Flush To Zero
        vDst = _simd_andnot_si(vFTZMask, vDst);
        // Apply Infinites / NaN
        vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));

        // Apply clamps
        vDst = _simd_andnot_si(vClampMask, vDst);
        vDst = _simd_or_si(vDst,
            _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));

        // Compute Denormals (subnormals)
        if (!_mm256_testz_si256(vDenormMask, vDenormMask))
        {
            uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
            uint32_t *pExp = (uint32_t*)&vExp;
            uint32_t *pMan = (uint32_t*)&vMan;
            uint32_t *pDst = (uint32_t*)&vDst;
            for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
            {
                if (pDenormMask[i])
                {
                    // Need to compute subnormal value
                    uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
                    uint32_t mantissa = pMan[i] |
                        (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit

                    pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
                }
            }
        }

        // Add in sign bits
        vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));

        // Pack to lower 128-bits
        vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));

#if 0
#if !defined(NDEBUG)
        simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));

        for (uint32_t i = 0; i < 4; ++i)
        {
            SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
        }
#endif
#endif

        return _simd_castsi_ps(vDst);

#else
        return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
#endif
#else
#error Unsupported vector width
#endif
    }

    static simdscalar unpack(const simdscalar &in)
    {
        // input is 8 packed float16, output is 8 packed float32
        SWR_ASSERT(0); // @todo
        return _simd_setzero_ps();
    }
#if ENABLE_AVX512_SIMD16

    static simd16scalar pack(const simd16scalar &in)
    {
        simd16scalari result = _simd16_setzero_si();
        simdscalari resultlo = _simd_setzero_si();

#if (KNOB_ARCH == KNOB_ARCH_AVX)
        simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
        simdscalar simdhi = pack(_simd16_extract_ps(in, 1));

        __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
        __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);

#else
        __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
        __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);

#endif
        resultlo = _simd_insertf128_si(resultlo, templo, 0);
        resultlo = _simd_insertf128_si(resultlo, temphi, 1);

        result = _simd16_insert_si(result, resultlo, 0);

        return _simd16_castsi_ps(result);
    }

    static simd16scalar unpack(const simd16scalar &in)
    {
        // input is 16 packed float16, output is 16 packed float32
        SWR_ASSERT(0); // @todo
        return _simd16_setzero_ps();
    }
#endif
};

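// Scalar sketch of the float32 -> float16 bit manipulation that the vectorized
// pack() above performs (illustrative only; edge-case handling is simplified
// and may not match the vector path bit-for-bit). Truncating rounding, in the
// spirit of _MM_FROUND_TRUNC. Operates directly on raw bit patterns.
static inline uint16_t PackFloat16_Example(uint32_t floatBits)  // hypothetical helper
{
    uint32_t sign     = (floatBits >> 16) & 0x8000;                 // move sign from bit 31 to bit 15
    int32_t  exponent = int32_t((floatBits >> 23) & 0xFF) - 127;    // unbiased float32 exponent
    uint32_t mantissa = floatBits & 0x007FFFFF;

    if (exponent == 128)        // Inf / NaN: half exponent becomes all ones
    {
        return uint16_t(sign | 0x7C00 | (mantissa ? 0x200 : 0));
    }
    if (exponent > 15)          // too large for half: clamp to max finite half (0x7BFF)
    {
        return uint16_t(sign | 0x7BFF);
    }
    if (exponent >= -14)        // normal half range: rebias exponent, truncate mantissa
    {
        return uint16_t(sign | ((exponent + 15) << 10) | (mantissa >> 13));
    }
    if (exponent >= -24)        // half subnormal: shift the now-explicit leading 1 into place
    {
        mantissa |= 0x00800000;
        return uint16_t(sign | (mantissa >> (13 + (-14 - exponent))));
    }
    return uint16_t(sign);      // too small to represent: flush to zero
}
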
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for FLOAT32
//////////////////////////////////////////////////////////////////////////
template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
{
    static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
    static float toFloat() { return 1.0f; }
    static float fromFloat() { return 1.0f; }
    static inline simdscalar convertSrgb(simdscalar &in)
    {
#if KNOB_SIMD_WIDTH == 8
#if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
        __m128 srcLo = _mm256_extractf128_ps(in, 0);
        __m128 srcHi = _mm256_extractf128_ps(in, 1);

        srcLo = ConvertFloatToSRGB2(srcLo);
        srcHi = ConvertFloatToSRGB2(srcHi);

        in = _mm256_insertf128_ps(in, srcLo, 0);
        in = _mm256_insertf128_ps(in, srcHi, 1);
#endif
#else
#error Unsupported vector width
#endif
        return in;
    }
#if ENABLE_AVX512_SIMD16

    static inline simd16scalar convertSrgb(simd16scalar &in)
    {
        return ConvertFloatToSRGB2(in);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Format1 - Bitfield for single component formats.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x>
struct Format1
{
    union
    {
        uint32_t r : x;

        ///@ The following are here to provide full template needed in Formats.
        uint32_t g : x;
        uint32_t b : x;
        uint32_t a : x;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format1 - Bitfield for single component formats - 8 bit specialization
//////////////////////////////////////////////////////////////////////////
template<>
struct Format1<8>
{
    union
    {
        uint8_t r;

        ///@ The following are here to provide full template needed in Formats.
        uint8_t g;
        uint8_t b;
        uint8_t a;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format1 - Bitfield for single component formats - 16 bit specialization
//////////////////////////////////////////////////////////////////////////
template<>
struct Format1<16>
{
    union
    {
        uint16_t r;

        ///@ The following are here to provide full template needed in Formats.
        uint16_t g;
        uint16_t b;
        uint16_t a;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format2 - Bitfield for 2 component formats.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y>
union Format2
{
    struct
    {
        uint32_t r : x;
        uint32_t g : y;
    };
    struct
    {
        ///@ The following are here to provide full template needed in Formats.
        uint32_t b : x;
        uint32_t a : y;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format2 - Bitfield for 2 component formats - 16 bit specialization
//////////////////////////////////////////////////////////////////////////
template<>
union Format2<8,8>
{
    struct
    {
        uint16_t r : 8;
        uint16_t g : 8;
    };
    struct
    {
        ///@ The following are here to provide full template needed in Formats.
        uint16_t b : 8;
        uint16_t a : 8;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format3 - Bitfield for 3 component formats.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z>
union Format3
{
    struct
    {
        uint32_t r : x;
        uint32_t g : y;
        uint32_t b : z;
    };
    uint32_t a; ///@note This is here to provide full template needed in Formats.
};

//////////////////////////////////////////////////////////////////////////
/// Format3 - Bitfield for 3 component formats - 16 bit specialization
//////////////////////////////////////////////////////////////////////////
template<>
union Format3<5,6,5>
{
    struct
    {
        uint16_t r : 5;
        uint16_t g : 6;
        uint16_t b : 5;
    };
    uint16_t a; ///@note This is here to provide full template needed in Formats.
};

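#if 0
// Illustrative sketch (hypothetical helper, not part of the original header):
// how one of these bitfield views can pick apart a packed 5:6:5 pixel. With the
// usual little-endian bitfield layout, r/g/b map to the low, middle, and high
// bits of the 16-bit word in declaration order.
static inline void Decode565_Example(uint16_t pixel, uint32_t &r, uint32_t &g, uint32_t &b)
{
    Format3<5, 6, 5> view;
    view.a = pixel;     // 'a' aliases the whole 16-bit word in this union
    r = view.r;         // bits [4:0]
    g = view.g;         // bits [10:5]
    b = view.b;         // bits [15:11]
}
#endif
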
//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
struct Format4
{
    uint32_t r : x;
    uint32_t g : y;
    uint32_t b : z;
    uint32_t a : w;
};

//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats - 16 bit specialization
//////////////////////////////////////////////////////////////////////////
template<>
struct Format4<5,5,5,1>
{
    uint16_t r : 5;
    uint16_t g : 5;
    uint16_t b : 5;
    uint16_t a : 1;
};

//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats - 16 bit specialization
//////////////////////////////////////////////////////////////////////////
template<>
struct Format4<4,4,4,4>
{
    uint16_t r : 4;
    uint16_t g : 4;
    uint16_t b : 4;
    uint16_t a : 4;
};

//////////////////////////////////////////////////////////////////////////
/// ComponentTraits - Default components
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
struct Defaults
{
    INLINE static uint32_t GetDefault(uint32_t comp)
    {
        static const uint32_t defaults[4]{ x, y, z, w };
        return defaults[comp];
    }
};

//////////////////////////////////////////////////////////////////////////
/// ComponentTraits - Component type traits.
//////////////////////////////////////////////////////////////////////////
template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
struct ComponentTraits
{
    INLINE static SWR_TYPE GetType(uint32_t comp)
    {
        static const SWR_TYPE CompType[4]{ X, Y, Z, W };
        return CompType[comp];
    }

    INLINE static uint32_t GetBPC(uint32_t comp)
    {
        static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
        return MyBpc[comp];
    }

    INLINE static bool isNormalized(uint32_t comp)
    {
        switch (comp)
        {
        case 0:
            return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
        case 1:
            return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
        case 2:
            return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
        case 3:
            return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
        }
        SWR_ASSERT(0);
        return false;
    }

    INLINE static float toFloat(uint32_t comp)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::toFloat();
        case 1:
            return TypeTraits<Y, NumBitsY>::toFloat();
        case 2:
            return TypeTraits<Z, NumBitsZ>::toFloat();
        case 3:
            return TypeTraits<W, NumBitsW>::toFloat();
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::toFloat();
    }

    INLINE static float fromFloat(uint32_t comp)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::fromFloat();
        case 1:
            return TypeTraits<Y, NumBitsY>::fromFloat();
        case 2:
            return TypeTraits<Z, NumBitsZ>::fromFloat();
        case 3:
            return TypeTraits<W, NumBitsW>::fromFloat();
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::fromFloat();
    }

    INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
        case 1:
            return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
        case 2:
            return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
        case 3:
            return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
    }

    INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
    {
        switch (comp)
        {
        case 0:
            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
            return;
        case 1:
            TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
            return;
        case 2:
            TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
            return;
        case 3:
            TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
            return;
        }
        SWR_ASSERT(0);
        TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
    }

    INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::unpack(in);
        case 1:
            return TypeTraits<Y, NumBitsY>::unpack(in);
        case 2:
            return TypeTraits<Z, NumBitsZ>::unpack(in);
        case 3:
            return TypeTraits<W, NumBitsW>::unpack(in);
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::unpack(in);
    }

    INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::pack(in);
        case 1:
            return TypeTraits<Y, NumBitsY>::pack(in);
        case 2:
            return TypeTraits<Z, NumBitsZ>::pack(in);
        case 3:
            return TypeTraits<W, NumBitsW>::pack(in);
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::pack(in);
    }

    INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::convertSrgb(in);
        case 1:
            return TypeTraits<Y, NumBitsY>::convertSrgb(in);
        case 2:
            return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
        case 3:
            return TypeTraits<W, NumBitsW>::convertSrgb(in);
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::convertSrgb(in);
    }
#if ENABLE_AVX512_SIMD16

    INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
        case 1:
            return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
        case 2:
            return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
        case 3:
            return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
    }

    INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
    {
        switch (comp)
        {
        case 0:
            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
            return;
        case 1:
            TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
            return;
        case 2:
            TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
            return;
        case 3:
            TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
            return;
        }
        SWR_ASSERT(0);
        TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
    }

    INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::unpack(in);
        case 1:
            return TypeTraits<Y, NumBitsY>::unpack(in);
        case 2:
            return TypeTraits<Z, NumBitsZ>::unpack(in);
        case 3:
            return TypeTraits<W, NumBitsW>::unpack(in);
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::unpack(in);
    }

    INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::pack(in);
        case 1:
            return TypeTraits<Y, NumBitsY>::pack(in);
        case 2:
            return TypeTraits<Z, NumBitsZ>::pack(in);
        case 3:
            return TypeTraits<W, NumBitsW>::pack(in);
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::pack(in);
    }

    INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
    {
        switch (comp)
        {
        case 0:
            return TypeTraits<X, NumBitsX>::convertSrgb(in);
        case 1:
            return TypeTraits<Y, NumBitsY>::convertSrgb(in);
        case 2:
            return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
        case 3:
            return TypeTraits<W, NumBitsW>::convertSrgb(in);
        }
        SWR_ASSERT(0);
        return TypeTraits<X, NumBitsX>::convertSrgb(in);
    }
#endif
};
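
#if 0
// Illustrative sketch (hypothetical usage, not part of the original header):
// an RGBA8 UNORM layout could be described and queried like this. Each accessor
// dispatches to the matching TypeTraits/PackTraits specialization above.
typedef ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8> RGBA8_Example;

static inline void QueryRGBA8_Example()
{
    uint32_t bpc   = RGBA8_Example::GetBPC(0);          // 8 bits in the red channel
    SWR_TYPE type  = RGBA8_Example::GetType(3);         // SWR_TYPE_UNORM for alpha
    bool     norm  = RGBA8_Example::isNormalized(2);    // true: UNORM channels are normalized
    float    scale = RGBA8_Example::toFloat(1);         // 1.0f / 255.0f, from TypeTraits<SWR_TYPE_UNORM, 8>
    (void)bpc; (void)type; (void)norm; (void)scale;
}
#endif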