1 /**************************************************************************** 2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file StoreTile.h 24 * 25 * @brief Functionality for Store. 26 * 27 ******************************************************************************/ 28 #pragma once 29 30 #include "common/os.h" 31 #include "common/formats.h" 32 #include "core/context.h" 33 #include "core/rdtsc_core.h" 34 #include "core/format_conversion.h" 35 36 #include "memory/TilingFunctions.h" 37 #include "memory/Convert.h" 38 #include "memory/SurfaceState.h" 39 #include "core/multisample.h" 40 41 #include <array> 42 #include <sstream> 43 44 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 45 46 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats. 47 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); 48 49 ////////////////////////////////////////////////////////////////////////// 50 /// Store Raster Tile Function Tables. 51 ////////////////////////////////////////////////////////////////////////// 52 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 53 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 54 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 55 56 void InitStoreTilesTable_Linear_1(); 57 void InitStoreTilesTable_Linear_2(); 58 void InitStoreTilesTable_TileX_1(); 59 void InitStoreTilesTable_TileX_2(); 60 void InitStoreTilesTable_TileY_1(); 61 void InitStoreTilesTable_TileY_2(); 62 void InitStoreTilesTable_TileW(); 63 void InitStoreTilesTable(); 64 65 ////////////////////////////////////////////////////////////////////////// 66 /// StorePixels 67 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 68 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 69 /// @param ppDsts - Array of destination pointers. Each pointer is 70 /// to a single row of at most 16B. 71 /// @tparam NumDests - Number of destination pointers. Each pair of 72 /// pointers is for a 16-byte column of two rows. 73 ////////////////////////////////////////////////////////////////////////// 74 template <size_t PixelSize, size_t NumDests> 75 struct StorePixels 76 { 77 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; 78 }; 79 80 ////////////////////////////////////////////////////////////////////////// 81 /// StorePixels (32-bit pixel specialization) 82 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 83 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 84 /// @param ppDsts - Array of destination pointers. Each pointer is 85 /// to a single row of at most 16B. 86 /// @tparam NumDests - Number of destination pointers. Each pair of 87 /// pointers is for a 16-byte column of two rows. 88 ////////////////////////////////////////////////////////////////////////// 89 template <> 90 struct StorePixels<8, 2> 91 { 92 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 93 { 94 // Each 4-pixel row is 4 bytes. 95 const uint16_t* pPixSrc = (const uint16_t*)pSrc; 96 97 // Unswizzle from SWR-Z order 98 uint16_t* pRow = (uint16_t*)ppDsts[0]; 99 pRow[0] = pPixSrc[0]; 100 pRow[1] = pPixSrc[2]; 101 102 pRow = (uint16_t*)ppDsts[1]; 103 pRow[0] = pPixSrc[1]; 104 pRow[1] = pPixSrc[3]; 105 } 106 }; 107 108 template <> 109 struct StorePixels<8, 4> 110 { 111 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 112 { 113 // 8 x 2 bytes = 16 bytes, 16 pixels 114 const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc); 115 116 uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts); 117 118 // Unswizzle from SWR-Z order 119 ppDsts16[0][0] = pSrc16[0]; // 0 1 120 ppDsts16[0][1] = pSrc16[2]; // 4 5 121 122 ppDsts16[1][0] = pSrc16[1]; // 2 3 123 ppDsts16[1][1] = pSrc16[3]; // 6 7 124 125 ppDsts16[2][0] = pSrc16[4]; // 8 9 126 ppDsts16[2][1] = pSrc16[6]; // C D 127 128 ppDsts16[3][0] = pSrc16[5]; // A B 129 ppDsts16[3][1] = pSrc16[7]; // E F 130 } 131 }; 132 133 ////////////////////////////////////////////////////////////////////////// 134 /// StorePixels (32-bit pixel specialization) 135 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 136 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 137 /// @param ppDsts - Array of destination pointers. Each pointer is 138 /// to a single row of at most 16B. 139 /// @tparam NumDests - Number of destination pointers. Each pair of 140 /// pointers is for a 16-byte column of two rows. 141 ////////////////////////////////////////////////////////////////////////// 142 template <> 143 struct StorePixels<16, 2> 144 { 145 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 146 { 147 // Each 4-pixel row is 8 bytes. 148 const uint32_t* pPixSrc = (const uint32_t*)pSrc; 149 150 // Unswizzle from SWR-Z order 151 uint32_t* pRow = (uint32_t*)ppDsts[0]; 152 pRow[0] = pPixSrc[0]; 153 pRow[1] = pPixSrc[2]; 154 155 pRow = (uint32_t*)ppDsts[1]; 156 pRow[0] = pPixSrc[1]; 157 pRow[1] = pPixSrc[3]; 158 } 159 }; 160 161 template <> 162 struct StorePixels<16, 4> 163 { 164 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 165 { 166 // 8 x 4 bytes = 32 bytes, 16 pixels 167 const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc); 168 169 uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts); 170 171 // Unswizzle from SWR-Z order 172 ppDsts32[0][0] = pSrc32[0]; // 0 1 173 ppDsts32[0][1] = pSrc32[2]; // 4 5 174 175 ppDsts32[1][0] = pSrc32[1]; // 2 3 176 ppDsts32[1][1] = pSrc32[3]; // 6 7 177 178 ppDsts32[2][0] = pSrc32[4]; // 8 9 179 ppDsts32[2][1] = pSrc32[6]; // C D 180 181 ppDsts32[3][0] = pSrc32[5]; // A B 182 ppDsts32[3][1] = pSrc32[7]; // E F 183 } 184 }; 185 186 ////////////////////////////////////////////////////////////////////////// 187 /// StorePixels (32-bit pixel specialization) 188 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 189 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 190 /// @param ppDsts - Array of destination pointers. Each pointer is 191 /// to a single row of at most 16B. 192 /// @tparam NumDests - Number of destination pointers. Each pair of 193 /// pointers is for a 16-byte column of two rows. 194 ////////////////////////////////////////////////////////////////////////// 195 template <> 196 struct StorePixels<32, 2> 197 { 198 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 199 { 200 // Each 4-pixel row is 16-bytes 201 simd4scalari *pZRow01 = (simd4scalari*)pSrc; 202 simd4scalari vQuad00 = SIMD128::load_si(pZRow01); 203 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1); 204 205 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01); 206 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01); 207 208 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00); 209 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10); 210 } 211 }; 212 213 template <> 214 struct StorePixels<32, 4> 215 { 216 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 217 { 218 // 4 x 16 bytes = 64 bytes, 16 pixels 219 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); 220 221 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); 222 223 // Unswizzle from SWR-Z order 224 simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3 225 simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7 226 simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B 227 simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F 228 229 SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5 230 SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7 231 SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D 232 SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F 233 } 234 }; 235 236 ////////////////////////////////////////////////////////////////////////// 237 /// StorePixels (32-bit pixel specialization) 238 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 239 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 240 /// @param ppDsts - Array of destination pointers. Each pointer is 241 /// to a single row of at most 16B. 242 /// @tparam NumDests - Number of destination pointers. Each pair of 243 /// pointers is for a 16-byte column of two rows. 244 ////////////////////////////////////////////////////////////////////////// 245 template <> 246 struct StorePixels<64, 4> 247 { 248 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 249 { 250 // Each 4-pixel row is 32 bytes. 251 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc; 252 253 // order of pointers match SWR-Z layout 254 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0]; 255 *pvDsts[0] = pPixSrc[0]; 256 *pvDsts[1] = pPixSrc[1]; 257 *pvDsts[2] = pPixSrc[2]; 258 *pvDsts[3] = pPixSrc[3]; 259 } 260 }; 261 262 template <> 263 struct StorePixels<64, 8> 264 { 265 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) 266 { 267 // 8 x 16 bytes = 128 bytes, 16 pixels 268 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); 269 270 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); 271 272 // order of pointers match SWR-Z layout 273 *ppDsts128[0] = pSrc128[0]; // 0 1 274 *ppDsts128[1] = pSrc128[1]; // 2 3 275 *ppDsts128[2] = pSrc128[2]; // 4 5 276 *ppDsts128[3] = pSrc128[3]; // 6 7 277 *ppDsts128[4] = pSrc128[4]; // 8 9 278 *ppDsts128[5] = pSrc128[5]; // A B 279 *ppDsts128[6] = pSrc128[6]; // C D 280 *ppDsts128[7] = pSrc128[7]; // E F 281 } 282 }; 283 284 ////////////////////////////////////////////////////////////////////////// 285 /// StorePixels (32-bit pixel specialization) 286 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 287 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 288 /// @param ppDsts - Array of destination pointers. Each pointer is 289 /// to a single row of at most 16B. 290 /// @tparam NumDests - Number of destination pointers. Each pair of 291 /// pointers is for a 16-byte column of two rows. 292 ////////////////////////////////////////////////////////////////////////// 293 template <> 294 struct StorePixels<128, 8> 295 { 296 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) 297 { 298 // Each 4-pixel row is 64 bytes. 299 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc; 300 301 // Unswizzle from SWR-Z order 302 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0]; 303 *pvDsts[0] = pPixSrc[0]; 304 *pvDsts[1] = pPixSrc[2]; 305 *pvDsts[2] = pPixSrc[1]; 306 *pvDsts[3] = pPixSrc[3]; 307 *pvDsts[4] = pPixSrc[4]; 308 *pvDsts[5] = pPixSrc[6]; 309 *pvDsts[6] = pPixSrc[5]; 310 *pvDsts[7] = pPixSrc[7]; 311 } 312 }; 313 314 template <> 315 struct StorePixels<128, 16> 316 { 317 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16]) 318 { 319 // 16 x 16 bytes = 256 bytes, 16 pixels 320 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); 321 322 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); 323 324 for (uint32_t i = 0; i < 16; i += 4) 325 { 326 *ppDsts128[i + 0] = pSrc128[i + 0]; 327 *ppDsts128[i + 1] = pSrc128[i + 2]; 328 *ppDsts128[i + 2] = pSrc128[i + 1]; 329 *ppDsts128[i + 3] = pSrc128[i + 3]; 330 } 331 } 332 }; 333 334 ////////////////////////////////////////////////////////////////////////// 335 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 336 ////////////////////////////////////////////////////////////////////////// 337 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 338 struct ConvertPixelsSOAtoAOS 339 { 340 ////////////////////////////////////////////////////////////////////////// 341 /// @brief Converts a SIMD from the Hot Tile to the destination format 342 /// and converts from SOA to AOS. 343 /// @param pSrc - Pointer to raster tile. 344 /// @param pDst - Pointer to destination surface or deswizzling buffer. 345 template <size_t NumDests> 346 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 347 { 348 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 349 350 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES] = {0}; 351 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES] = {0}; 352 353 // Convert from SrcFormat --> DstFormat 354 simd16vector src; 355 LoadSOA<SrcFormat>(pSrc, src); 356 StoreSOA<DstFormat>(src, soaTile); 357 358 // Convert from SOA --> AOS 359 FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile); 360 361 // Store data into destination 362 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); 363 } 364 }; 365 366 ////////////////////////////////////////////////////////////////////////// 367 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 368 /// Specialization for no format conversion 369 ////////////////////////////////////////////////////////////////////////// 370 template<SWR_FORMAT Format> 371 struct ConvertPixelsSOAtoAOS<Format, Format> 372 { 373 ////////////////////////////////////////////////////////////////////////// 374 /// @brief Converts a SIMD from the Hot Tile to the destination format 375 /// and converts from SOA to AOS. 376 /// @param pSrc - Pointer to raster tile. 377 /// @param pDst - Pointer to destination surface or deswizzling buffer. 378 template <size_t NumDests> 379 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 380 { 381 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 382 383 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 384 385 // Convert from SOA --> AOS 386 FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile); 387 388 // Store data into destination 389 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts); 390 } 391 }; 392 393 ////////////////////////////////////////////////////////////////////////// 394 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM 395 ////////////////////////////////////////////////////////////////////////// 396 template<> 397 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > 398 { 399 ////////////////////////////////////////////////////////////////////////// 400 /// @brief Converts a SIMD from the Hot Tile to the destination format 401 /// and converts from SOA to AOS. 402 /// @param pSrc - Pointer to raster tile. 403 /// @param pDst - Pointer to destination surface or deswizzling buffer. 404 template <size_t NumDests> 405 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 406 { 407 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; 408 static const SWR_FORMAT DstFormat = B5G6R5_UNORM; 409 410 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 411 412 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 413 414 // Load hot-tile 415 simd16vector src, dst; 416 LoadSOA<SrcFormat>(pSrc, src); 417 418 // deswizzle 419 dst.x = src[FormatTraits<DstFormat>::swizzle(0)]; 420 dst.y = src[FormatTraits<DstFormat>::swizzle(1)]; 421 dst.z = src[FormatTraits<DstFormat>::swizzle(2)]; 422 423 // clamp 424 dst.x = Clamp<DstFormat>(dst.x, 0); 425 dst.y = Clamp<DstFormat>(dst.y, 1); 426 dst.z = Clamp<DstFormat>(dst.z, 2); 427 428 // normalize 429 dst.x = Normalize<DstFormat>(dst.x, 0); 430 dst.y = Normalize<DstFormat>(dst.y, 1); 431 dst.z = Normalize<DstFormat>(dst.z, 2); 432 433 // pack 434 simd16scalari packed = _simd16_castps_si(dst.x); 435 436 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5); 437 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6); 438 439 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5)); 440 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6)); 441 442 // pack low 16 bits of each 32 bit lane to low 128 bits of dst 443 uint32_t *pPacked = (uint32_t*)&packed; 444 uint16_t *pAosTile = (uint16_t*)&aosTile[0]; 445 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t) 446 { 447 *pAosTile++ = *pPacked++; 448 } 449 450 // Store data into destination 451 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); 452 } 453 }; 454 455 ////////////////////////////////////////////////////////////////////////// 456 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 457 ////////////////////////////////////////////////////////////////////////// 458 template<> 459 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS> 460 { 461 static const SWR_FORMAT SrcFormat = R32_FLOAT; 462 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS; 463 464 ////////////////////////////////////////////////////////////////////////// 465 /// @brief Converts a SIMD from the Hot Tile to the destination format 466 /// and converts from SOA to AOS. 467 /// @param pSrc - Pointer to raster tile. 468 /// @param pDst - Pointer to destination surface or deswizzling buffer. 469 template <size_t NumDests> 470 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 471 { 472 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 473 474 // clamp 475 const simd16scalar zero = _simd16_setzero_ps(); 476 const simd16scalar ones = _simd16_set1_ps(1.0f); 477 478 comp = _simd16_max_ps(comp, zero); 479 comp = _simd16_min_ps(comp, ones); 480 481 // normalize 482 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 483 484 simd16scalari temp = _simd16_cvtps_epi32(comp); 485 486 // swizzle 487 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 488 489 // merge/store data into destination but don't overwrite the X8 bits 490 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0])); 491 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2])); 492 493 simd16scalari dest = _simd16_setzero_si(); 494 495 dest = _simd16_insert_si(dest, destlo, 0); 496 dest = _simd16_insert_si(dest, desthi, 1); 497 498 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF); 499 500 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp)); 501 502 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0)); 503 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1)); 504 } 505 }; 506 507 template<SWR_FORMAT DstFormat> 508 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) 509 { 510 // swizzle rgba -> bgra while we load 511 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr 512 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg 513 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb 514 simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa 515 516 // clamp 517 const simd16scalar zero = _simd16_setzero_ps(); 518 const simd16scalar ones = _simd16_set1_ps(1.0f); 519 520 comp0 = _simd16_max_ps(comp0, zero); 521 comp0 = _simd16_min_ps(comp0, ones); 522 523 comp1 = _simd16_max_ps(comp1, zero); 524 comp1 = _simd16_min_ps(comp1, ones); 525 526 comp2 = _simd16_max_ps(comp2, zero); 527 comp2 = _simd16_min_ps(comp2, ones); 528 529 comp3 = _simd16_max_ps(comp3, zero); 530 comp3 = _simd16_min_ps(comp3, ones); 531 532 // gamma-correct only rgb 533 if (FormatTraits<DstFormat>::isSRGB) 534 { 535 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); 536 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); 537 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); 538 } 539 540 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format 541 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 542 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 543 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 544 comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); 545 546 // moving to 16 wide integer vector types 547 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr 548 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg 549 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb 550 simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa 551 552 // SOA to AOS conversion 553 src1 = _simd16_slli_epi32(src1, 8); 554 src2 = _simd16_slli_epi32(src2, 16); 555 src3 = _simd16_slli_epi32(src3, 24); 556 557 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F 558 559 // de-swizzle conversion 560 #if 1 561 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B 562 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F 563 564 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F 565 566 #else 567 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 568 569 #endif 570 // store 8x2 memory order: 571 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } 572 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } 573 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0)); 574 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1)); 575 } 576 577 template<SWR_FORMAT DstFormat> 578 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) 579 { 580 static const uint32_t offset = sizeof(simdscalar); 581 582 // swizzle rgba -> bgra while we load 583 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr 584 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg 585 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb 586 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa 587 588 // clamp 589 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); 590 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); 591 592 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); 593 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); 594 595 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); 596 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); 597 598 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); 599 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); 600 601 if (FormatTraits<DstFormat>::isSRGB) 602 { 603 // Gamma-correct only rgb 604 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); 605 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); 606 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); 607 } 608 609 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format 610 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 611 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 612 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 613 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); 614 615 // moving to 8 wide integer vector types 616 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr 617 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg 618 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb 619 simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa 620 621 #if KNOB_ARCH <= KNOB_ARCH_AVX 622 623 // splitting into two sets of 4 wide integer vector types 624 // because AVX doesn't have instructions to support this operation at 8 wide 625 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r 626 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g 627 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b 628 simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a 629 630 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r 631 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g 632 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b 633 simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a 634 635 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 636 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 637 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 638 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 639 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 640 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 641 642 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr 643 srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00 644 645 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr 646 srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00 647 648 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr 649 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr 650 651 // unpack into rows that get the tiling order correct 652 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr 653 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0); 654 655 simdscalari final = _mm256_castsi128_si256(vRow00); 656 final = _mm256_insertf128_si256(final, vRow10, 1); 657 658 #else 659 660 // logic is as above, only wider 661 src1 = _mm256_slli_si256(src1, 1); 662 src2 = _mm256_slli_si256(src2, 2); 663 src3 = _mm256_slli_si256(src3, 3); 664 665 src0 = _mm256_or_si256(src0, src1); 666 src2 = _mm256_or_si256(src2, src3); 667 668 simdscalari final = _mm256_or_si256(src0, src2); 669 670 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 671 final = _mm256_permute4x64_epi64(final, 0xD8); 672 #endif 673 674 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final); 675 } 676 677 template<SWR_FORMAT DstFormat> 678 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) 679 { 680 // swizzle rgba -> bgra while we load 681 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr 682 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg 683 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb 684 685 // clamp 686 const simd16scalar zero = _simd16_setzero_ps(); 687 const simd16scalar ones = _simd16_set1_ps(1.0f); 688 689 comp0 = _simd16_max_ps(comp0, zero); 690 comp0 = _simd16_min_ps(comp0, ones); 691 692 comp1 = _simd16_max_ps(comp1, zero); 693 comp1 = _simd16_min_ps(comp1, ones); 694 695 comp2 = _simd16_max_ps(comp2, zero); 696 comp2 = _simd16_min_ps(comp2, ones); 697 698 // gamma-correct only rgb 699 if (FormatTraits<DstFormat>::isSRGB) 700 { 701 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); 702 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); 703 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); 704 } 705 706 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format 707 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 708 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 709 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 710 711 // moving to 16 wide integer vector types 712 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr 713 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg 714 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb 715 716 // SOA to AOS conversion 717 src1 = _simd16_slli_epi32(src1, 8); 718 src2 = _simd16_slli_epi32(src2, 16); 719 720 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F 721 722 // de-swizzle conversion 723 #if 1 724 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B 725 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F 726 727 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F 728 729 #else 730 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 731 732 #endif 733 // store 8x2 memory order: 734 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } 735 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } 736 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0)); 737 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1)); 738 } 739 740 template<SWR_FORMAT DstFormat> 741 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) 742 { 743 static const uint32_t offset = sizeof(simdscalar); 744 745 // swizzle rgba -> bgra while we load 746 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr 747 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg 748 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb 749 // clamp 750 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); 751 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); 752 753 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); 754 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); 755 756 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); 757 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); 758 759 if (FormatTraits<DstFormat>::isSRGB) 760 { 761 // Gamma-correct only rgb 762 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); 763 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); 764 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); 765 } 766 767 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format 768 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 769 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 770 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 771 772 // moving to 8 wide integer vector types 773 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr 774 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg 775 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb 776 777 #if KNOB_ARCH <= KNOB_ARCH_AVX 778 779 // splitting into two sets of 4 wide integer vector types 780 // because AVX doesn't have instructions to support this operation at 8 wide 781 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r 782 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g 783 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b 784 785 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r 786 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g 787 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b 788 789 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 790 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 791 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 792 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 793 794 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr 795 796 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr 797 798 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr 799 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr 800 801 // unpack into rows that get the tiling order correct 802 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr 803 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0); 804 805 simdscalari final = _mm256_castsi128_si256(vRow00); 806 final = _mm256_insertf128_si256(final, vRow10, 1); 807 808 #else 809 810 // logic is as above, only wider 811 src1 = _mm256_slli_si256(src1, 1); 812 src2 = _mm256_slli_si256(src2, 2); 813 814 src0 = _mm256_or_si256(src0, src1); 815 816 simdscalari final = _mm256_or_si256(src0, src2); 817 818 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 819 final = _mm256_permute4x64_epi64(final, 0xD8); 820 821 #endif 822 823 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final); 824 } 825 826 template<> 827 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> 828 { 829 template <size_t NumDests> 830 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 831 { 832 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 833 } 834 }; 835 836 template<> 837 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> 838 { 839 template <size_t NumDests> 840 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 841 { 842 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 843 } 844 }; 845 846 template<> 847 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > 848 { 849 template <size_t NumDests> 850 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 851 { 852 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 853 } 854 }; 855 856 template<> 857 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > 858 { 859 template <size_t NumDests> 860 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 861 { 862 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 863 } 864 }; 865 866 template<> 867 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > 868 { 869 template <size_t NumDests> 870 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 871 { 872 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 873 } 874 }; 875 876 template<> 877 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > 878 { 879 template <size_t NumDests> 880 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 881 { 882 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 883 } 884 }; 885 886 template<> 887 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > 888 { 889 template <size_t NumDests> 890 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 891 { 892 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 893 } 894 }; 895 896 template<> 897 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > 898 { 899 template <size_t NumDests> 900 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 901 { 902 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 903 } 904 }; 905 906 ////////////////////////////////////////////////////////////////////////// 907 /// StoreRasterTile 908 ////////////////////////////////////////////////////////////////////////// 909 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 910 struct StoreRasterTile 911 { 912 ////////////////////////////////////////////////////////////////////////// 913 /// @brief Retrieve color from hot tile source which is always float. 914 /// @param pSrc - Pointer to raster tile. 915 /// @param x, y - Coordinates to raster tile. 916 /// @param output - output color 917 INLINE static void GetSwizzledSrcColor( 918 uint8_t* pSrc, 919 uint32_t x, uint32_t y, 920 float outputColor[4]) 921 { 922 typedef SimdTile_16<SrcFormat, DstFormat> SimdT; 923 924 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc); 925 926 // Compute which simd tile we're accessing within 8x8 tile. 927 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. 928 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM); 929 930 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex]; 931 932 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); 933 934 pSimdTile->GetSwizzledColor(simdOffset, outputColor); 935 } 936 937 ////////////////////////////////////////////////////////////////////////// 938 /// @brief Stores an 8x8 raster tile to the destination surface. 939 /// @param pSrc - Pointer to raster tile. 940 /// @param pDstSurface - Destination surface state 941 /// @param x, y - Coordinates to raster tile. 942 INLINE static void Store( 943 uint8_t *pSrc, 944 SWR_SURFACE_STATE* pDstSurface, 945 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. 946 { 947 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 948 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 949 950 // For each raster tile pixel (rx, ry) 951 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) 952 { 953 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) 954 { 955 // Perform bounds checking. 956 if (((x + rx) < lodWidth) && 957 ((y + ry) < lodHeight)) 958 { 959 float srcColor[4]; 960 GetSwizzledSrcColor(pSrc, rx, ry, srcColor); 961 962 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry), 963 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, 964 sampleNum, pDstSurface->lod, pDstSurface); 965 { 966 ConvertPixelFromFloat<DstFormat>(pDst, srcColor); 967 } 968 } 969 } 970 } 971 } 972 973 ////////////////////////////////////////////////////////////////////////// 974 /// @brief Resolves an 8x8 raster tile to the resolve destination surface. 975 /// @param pSrc - Pointer to raster tile. 976 /// @param pDstSurface - Destination surface state 977 /// @param x, y - Coordinates to raster tile. 978 /// @param sampleOffset - Offset between adjacent multisamples 979 INLINE static void Resolve( 980 uint8_t *pSrc, 981 SWR_SURFACE_STATE* pDstSurface, 982 uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. 983 { 984 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 985 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 986 987 float oneOverNumSamples = 1.0f / pDstSurface->numSamples; 988 989 // For each raster tile pixel (rx, ry) 990 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) 991 { 992 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) 993 { 994 // Perform bounds checking. 995 if (((x + rx) < lodWidth) && 996 ((y + ry) < lodHeight)) 997 { 998 // Sum across samples 999 float resolveColor[4] = {0}; 1000 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 1001 { 1002 float sampleColor[4] = {0}; 1003 uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum; 1004 GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor); 1005 resolveColor[0] += sampleColor[0]; 1006 resolveColor[1] += sampleColor[1]; 1007 resolveColor[2] += sampleColor[2]; 1008 resolveColor[3] += sampleColor[3]; 1009 } 1010 1011 // Divide by numSamples to average 1012 resolveColor[0] *= oneOverNumSamples; 1013 resolveColor[1] *= oneOverNumSamples; 1014 resolveColor[2] *= oneOverNumSamples; 1015 resolveColor[3] *= oneOverNumSamples; 1016 1017 // Use the resolve surface state 1018 SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress; 1019 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry), 1020 pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex, 1021 0, pResolveSurface->lod, pResolveSurface); 1022 { 1023 ConvertPixelFromFloat<DstFormat>(pDst, resolveColor); 1024 } 1025 } 1026 } 1027 } 1028 } 1029 1030 }; 1031 1032 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1033 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat> 1034 {}; 1035 1036 ////////////////////////////////////////////////////////////////////////// 1037 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp 1038 ////////////////////////////////////////////////////////////////////////// 1039 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1040 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> 1041 { 1042 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile; 1043 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1044 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1045 1046 ////////////////////////////////////////////////////////////////////////// 1047 /// @brief Stores an 8x8 raster tile to the destination surface. 1048 /// @param pSrc - Pointer to raster tile. 1049 /// @param pDstSurface - Destination surface state 1050 /// @param x, y - Coordinates to raster tile. 1051 INLINE static void Store( 1052 uint8_t *pSrc, 1053 SWR_SURFACE_STATE* pDstSurface, 1054 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1055 { 1056 // Punt non-full tiles to generic store 1057 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1058 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1059 1060 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1061 { 1062 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1063 } 1064 1065 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1066 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1067 1068 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1069 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1070 1071 uint8_t* ppDsts[] = 1072 { 1073 pDst, // row 0, col 0 1074 pDst + pDstSurface->pitch, // row 1, col 0 1075 pDst + dx / 2, // row 0, col 1 1076 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1077 }; 1078 1079 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1080 { 1081 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1082 { 1083 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1084 1085 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1086 1087 ppDsts[0] += dx; 1088 ppDsts[1] += dx; 1089 ppDsts[2] += dx; 1090 ppDsts[3] += dx; 1091 } 1092 1093 ppDsts[0] += dy; 1094 ppDsts[1] += dy; 1095 ppDsts[2] += dy; 1096 ppDsts[3] += dy; 1097 } 1098 } 1099 }; 1100 1101 ////////////////////////////////////////////////////////////////////////// 1102 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp 1103 ////////////////////////////////////////////////////////////////////////// 1104 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1105 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> 1106 { 1107 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile; 1108 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1109 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1110 1111 ////////////////////////////////////////////////////////////////////////// 1112 /// @brief Stores an 8x8 raster tile to the destination surface. 1113 /// @param pSrc - Pointer to raster tile. 1114 /// @param pDstSurface - Destination surface state 1115 /// @param x, y - Coordinates to raster tile. 1116 INLINE static void Store( 1117 uint8_t *pSrc, 1118 SWR_SURFACE_STATE* pDstSurface, 1119 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1120 { 1121 // Punt non-full tiles to generic store 1122 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1123 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1124 1125 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1126 { 1127 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1128 } 1129 1130 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1131 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1132 1133 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1134 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1135 1136 uint8_t* ppDsts[] = 1137 { 1138 pDst, // row 0, col 0 1139 pDst + pDstSurface->pitch, // row 1, col 0 1140 pDst + dx / 2, // row 0, col 1 1141 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1142 }; 1143 1144 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1145 { 1146 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1147 { 1148 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1149 1150 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1151 1152 ppDsts[0] += dx; 1153 ppDsts[1] += dx; 1154 ppDsts[2] += dx; 1155 ppDsts[3] += dx; 1156 } 1157 1158 ppDsts[0] += dy; 1159 ppDsts[1] += dy; 1160 ppDsts[2] += dy; 1161 ppDsts[3] += dy; 1162 } 1163 } 1164 }; 1165 1166 ////////////////////////////////////////////////////////////////////////// 1167 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp 1168 ////////////////////////////////////////////////////////////////////////// 1169 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1170 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> 1171 { 1172 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile; 1173 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1174 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1175 1176 ////////////////////////////////////////////////////////////////////////// 1177 /// @brief Stores an 8x8 raster tile to the destination surface. 1178 /// @param pSrc - Pointer to raster tile. 1179 /// @param pDstSurface - Destination surface state 1180 /// @param x, y - Coordinates to raster tile. 1181 INLINE static void Store( 1182 uint8_t *pSrc, 1183 SWR_SURFACE_STATE* pDstSurface, 1184 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1185 { 1186 // Punt non-full tiles to generic store 1187 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1188 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1189 1190 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1191 { 1192 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1193 } 1194 1195 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1196 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1197 1198 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1199 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1200 1201 uint8_t* ppDsts[] = 1202 { 1203 pDst, // row 0, col 0 1204 pDst + pDstSurface->pitch, // row 1, col 0 1205 pDst + dx / 2, // row 0, col 1 1206 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1207 }; 1208 1209 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1210 { 1211 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1212 { 1213 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1214 1215 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1216 1217 ppDsts[0] += dx; 1218 ppDsts[1] += dx; 1219 ppDsts[2] += dx; 1220 ppDsts[3] += dx; 1221 } 1222 1223 ppDsts[0] += dy; 1224 ppDsts[1] += dy; 1225 ppDsts[2] += dy; 1226 ppDsts[3] += dy; 1227 } 1228 } 1229 }; 1230 1231 ////////////////////////////////////////////////////////////////////////// 1232 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp 1233 ////////////////////////////////////////////////////////////////////////// 1234 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1235 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> 1236 { 1237 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile; 1238 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1239 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1240 static const size_t MAX_DST_COLUMN_BYTES = 16; 1241 1242 ////////////////////////////////////////////////////////////////////////// 1243 /// @brief Stores an 8x8 raster tile to the destination surface. 1244 /// @param pSrc - Pointer to raster tile. 1245 /// @param pDstSurface - Destination surface state 1246 /// @param x, y - Coordinates to raster tile. 1247 INLINE static void Store( 1248 uint8_t *pSrc, 1249 SWR_SURFACE_STATE* pDstSurface, 1250 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1251 { 1252 // Punt non-full tiles to generic store 1253 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1254 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1255 1256 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1257 { 1258 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1259 } 1260 1261 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1262 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1263 1264 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1265 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; 1266 1267 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1268 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets"); 1269 1270 uint8_t *ppDsts[] = 1271 { 1272 pDst, // row 0, col 0 1273 pDst + pDstSurface->pitch, // row 1, col 0 1274 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 1275 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 1276 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 1277 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 1278 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 1279 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3 1280 }; 1281 1282 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1283 { 1284 // Raster tile width is same as simd16 tile width 1285 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1286 1287 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1288 1289 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1290 1291 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) 1292 { 1293 ppDsts[i] += dy; 1294 } 1295 } 1296 } 1297 }; 1298 1299 ////////////////////////////////////////////////////////////////////////// 1300 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp 1301 ////////////////////////////////////////////////////////////////////////// 1302 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1303 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> 1304 { 1305 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; 1306 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1307 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1308 static const size_t MAX_DST_COLUMN_BYTES = 16; 1309 1310 ////////////////////////////////////////////////////////////////////////// 1311 /// @brief Stores an 8x8 raster tile to the destination surface. 1312 /// @param pSrc - Pointer to raster tile. 1313 /// @param pDstSurface - Destination surface state 1314 /// @param x, y - Coordinates to raster tile. 1315 INLINE static void Store( 1316 uint8_t *pSrc, 1317 SWR_SURFACE_STATE* pDstSurface, 1318 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1319 { 1320 // Punt non-full tiles to generic store 1321 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1322 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1323 1324 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1325 { 1326 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1327 } 1328 1329 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1330 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1331 1332 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1333 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; 1334 1335 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1336 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets"); 1337 1338 uint8_t* ppDsts[] = 1339 { 1340 pDst, // row 0, col 0 1341 pDst + pDstSurface->pitch, // row 1, col 0 1342 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 1343 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 1344 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 1345 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 1346 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 1347 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3 1348 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4 1349 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4 1350 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5 1351 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5 1352 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6 1353 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6 1354 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7 1355 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7 1356 }; 1357 1358 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1359 { 1360 // Raster tile width is same as simd16 tile width 1361 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1362 1363 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1364 1365 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1366 1367 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) 1368 { 1369 ppDsts[i] += dy; 1370 } 1371 } 1372 } 1373 }; 1374 1375 ////////////////////////////////////////////////////////////////////////// 1376 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp 1377 ////////////////////////////////////////////////////////////////////////// 1378 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1379 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> 1380 { 1381 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile; 1382 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1383 1384 ////////////////////////////////////////////////////////////////////////// 1385 /// @brief Stores an 8x8 raster tile to the destination surface. 1386 /// @param pSrc - Pointer to raster tile. 1387 /// @param pDstSurface - Destination surface state 1388 /// @param x, y - Coordinates to raster tile. 1389 INLINE static void Store( 1390 uint8_t *pSrc, 1391 SWR_SURFACE_STATE* pDstSurface, 1392 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1393 { 1394 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1395 1396 // Punt non-full tiles to generic store 1397 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1398 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1399 1400 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1401 { 1402 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1403 } 1404 1405 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1406 // We can compute the offsets to each column within the raster tile once and increment from these. 1407 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1408 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1409 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1410 1411 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1412 1413 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1414 uint8_t *ppDsts[] = 1415 { 1416 pDst, 1417 pDst + DestRowWidthBytes, 1418 pDst + DestRowWidthBytes / 4, 1419 pDst + DestRowWidthBytes + DestRowWidthBytes / 4 1420 }; 1421 1422 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1423 { 1424 // Raster tile width is same as simd16 tile width 1425 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1426 1427 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1428 1429 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1430 1431 ppDsts[0] += dy; 1432 ppDsts[1] += dy; 1433 ppDsts[2] += dy; 1434 ppDsts[3] += dy; 1435 } 1436 } 1437 }; 1438 1439 ////////////////////////////////////////////////////////////////////////// 1440 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp 1441 ////////////////////////////////////////////////////////////////////////// 1442 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1443 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> 1444 { 1445 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; 1446 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1447 1448 ////////////////////////////////////////////////////////////////////////// 1449 /// @brief Stores an 8x8 raster tile to the destination surface. 1450 /// @param pSrc - Pointer to raster tile. 1451 /// @param pDstSurface - Destination surface state 1452 /// @param x, y - Coordinates to raster tile. 1453 INLINE static void Store( 1454 uint8_t *pSrc, 1455 SWR_SURFACE_STATE* pDstSurface, 1456 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1457 { 1458 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1459 1460 // Punt non-full tiles to generic store 1461 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1462 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1463 1464 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1465 { 1466 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1467 } 1468 1469 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1470 // We can compute the offsets to each column within the raster tile once and increment from these. 1471 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1472 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1473 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1474 1475 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1476 1477 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1478 uint8_t *ppDsts[] = 1479 { 1480 pDst, 1481 pDst + DestRowWidthBytes, 1482 pDst + DestRowWidthBytes / 2, 1483 pDst + DestRowWidthBytes + DestRowWidthBytes / 2 1484 }; 1485 1486 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1487 { 1488 // Raster tile width is same as simd16 tile width 1489 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1490 1491 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1492 1493 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1494 1495 ppDsts[0] += dy; 1496 ppDsts[1] += dy; 1497 ppDsts[2] += dy; 1498 ppDsts[3] += dy; 1499 } 1500 } 1501 }; 1502 1503 ////////////////////////////////////////////////////////////////////////// 1504 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp 1505 ////////////////////////////////////////////////////////////////////////// 1506 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1507 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> 1508 { 1509 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; 1510 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1511 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1512 1513 ////////////////////////////////////////////////////////////////////////// 1514 /// @brief Stores an 8x8 raster tile to the destination surface. 1515 /// @param pSrc - Pointer to raster tile. 1516 /// @param pDstSurface - Destination surface state 1517 /// @param x, y - Coordinates to raster tile. 1518 INLINE static void Store( 1519 uint8_t *pSrc, 1520 SWR_SURFACE_STATE* pDstSurface, 1521 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1522 { 1523 static const uint32_t DestRowWidthBytes = 512; // 512B rows 1524 1525 // Punt non-full tiles to generic store 1526 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1527 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1528 1529 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1530 { 1531 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1532 } 1533 1534 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. 1535 // We can compute the offsets to each column within the raster tile once and increment from these. 1536 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1537 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1538 1539 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1540 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1541 1542 uint8_t* ppDsts[] = 1543 { 1544 pDst, // row 0, col 0 1545 pDst + DestRowWidthBytes, // row 1, col 0 1546 pDst + dx / 2, // row 0, col 1 1547 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1 1548 }; 1549 1550 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1551 { 1552 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1553 { 1554 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1555 1556 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1557 1558 ppDsts[0] += dx; 1559 ppDsts[1] += dx; 1560 ppDsts[2] += dx; 1561 ppDsts[3] += dx; 1562 } 1563 1564 ppDsts[0] += dy; 1565 ppDsts[1] += dy; 1566 ppDsts[2] += dy; 1567 ppDsts[3] += dy; 1568 } 1569 } 1570 }; 1571 1572 ////////////////////////////////////////////////////////////////////////// 1573 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp 1574 ////////////////////////////////////////////////////////////////////////// 1575 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1576 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> 1577 { 1578 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; 1579 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1580 1581 ////////////////////////////////////////////////////////////////////////// 1582 /// @brief Stores an 8x8 raster tile to the destination surface. 1583 /// @param pSrc - Pointer to raster tile. 1584 /// @param pDstSurface - Destination surface state 1585 /// @param x, y - Coordinates to raster tile. 1586 INLINE static void Store( 1587 uint8_t *pSrc, 1588 SWR_SURFACE_STATE* pDstSurface, 1589 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1590 { 1591 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1592 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 1593 1594 // Punt non-full tiles to generic store 1595 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1596 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1597 1598 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1599 { 1600 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1601 } 1602 1603 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1604 // We can compute the offsets to each column within the raster tile once and increment from these. 1605 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1606 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1607 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1608 1609 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1610 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1611 1612 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1613 uint8_t *ppDsts[] = 1614 { 1615 pDst, // row 0, col 0 1616 pDst + DestRowWidthBytes, // row 1, col 0 1617 pDst + DestColumnBytes, // row 0, col 1 1618 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1 1619 }; 1620 1621 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1622 { 1623 // Raster tile width is same as simd16 tile width 1624 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1625 1626 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1627 1628 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1629 1630 ppDsts[0] += dy; 1631 ppDsts[1] += dy; 1632 ppDsts[2] += dy; 1633 ppDsts[3] += dy; 1634 } 1635 } 1636 }; 1637 1638 ////////////////////////////////////////////////////////////////////////// 1639 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp 1640 ////////////////////////////////////////////////////////////////////////// 1641 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1642 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> 1643 { 1644 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; 1645 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1646 1647 ////////////////////////////////////////////////////////////////////////// 1648 /// @brief Stores an 8x8 raster tile to the destination surface. 1649 /// @param pSrc - Pointer to raster tile. 1650 /// @param pDstSurface - Destination surface state 1651 /// @param x, y - Coordinates to raster tile. 1652 INLINE static void Store( 1653 uint8_t *pSrc, 1654 SWR_SURFACE_STATE* pDstSurface, 1655 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1656 { 1657 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1658 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 1659 1660 // Punt non-full tiles to generic store 1661 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1662 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1663 1664 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1665 { 1666 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1667 } 1668 1669 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1670 // We can compute the offsets to each column within the raster tile once and increment from these. 1671 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1672 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1673 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1674 1675 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1676 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1677 1678 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1679 uint8_t *ppDsts[] = 1680 { 1681 pDst, // row 0, col 0 1682 pDst + DestRowWidthBytes, // row 1, col 0 1683 pDst + DestColumnBytes, // row 0, col 1 1684 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 1685 pDst + DestColumnBytes * 2, // row 0, col 2 1686 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 1687 pDst + DestColumnBytes * 3, // row 0, col 3 1688 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3 1689 }; 1690 1691 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1692 { 1693 // Raster tile width is same as simd16 tile width 1694 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1695 1696 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1697 1698 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1699 1700 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) 1701 { 1702 ppDsts[i] += dy; 1703 } 1704 } 1705 } 1706 }; 1707 1708 ////////////////////////////////////////////////////////////////////////// 1709 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp 1710 ////////////////////////////////////////////////////////////////////////// 1711 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1712 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> 1713 { 1714 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile; 1715 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1716 1717 ////////////////////////////////////////////////////////////////////////// 1718 /// @brief Stores an 8x8 raster tile to the destination surface. 1719 /// @param pSrc - Pointer to raster tile. 1720 /// @param pDstSurface - Destination surface state 1721 /// @param x, y - Coordinates to raster tile. 1722 INLINE static void Store( 1723 uint8_t *pSrc, 1724 SWR_SURFACE_STATE* pDstSurface, 1725 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1726 { 1727 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1728 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 1729 1730 // Punt non-full tiles to generic store 1731 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1732 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1733 1734 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1735 { 1736 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1737 } 1738 1739 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1740 // We can compute the offsets to each column within the raster tile once and increment from these. 1741 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1742 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1743 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1744 1745 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1746 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1747 1748 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1749 uint8_t *ppDsts[] = 1750 { 1751 pDst, // row 0, col 0 1752 pDst + DestRowWidthBytes, // row 1, col 0 1753 pDst + DestColumnBytes, // row 0, col 1 1754 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 1755 pDst + DestColumnBytes * 2, // row 0, col 2 1756 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 1757 pDst + DestColumnBytes * 3, // row 0, col 3 1758 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3 1759 pDst + DestColumnBytes * 4, // row 0, col 4 1760 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4 1761 pDst + DestColumnBytes * 5, // row 0, col 5 1762 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5 1763 pDst + DestColumnBytes * 6, // row 0, col 6 1764 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6 1765 pDst + DestColumnBytes * 7, // row 0, col 7 1766 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7 1767 }; 1768 1769 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1770 { 1771 // Raster tile width is same as simd16 tile width 1772 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1773 1774 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1775 1776 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1777 1778 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) 1779 { 1780 ppDsts[i] += dy; 1781 } 1782 } 1783 } 1784 }; 1785 1786 ////////////////////////////////////////////////////////////////////////// 1787 /// StoreMacroTile - Stores a macro tile which consists of raster tiles. 1788 ////////////////////////////////////////////////////////////////////////// 1789 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1790 struct StoreMacroTile 1791 { 1792 ////////////////////////////////////////////////////////////////////////// 1793 /// @brief Stores a macrotile to the destination surface using safe implementation. 1794 /// @param pSrc - Pointer to macro tile. 1795 /// @param pDstSurface - Destination surface state 1796 /// @param x, y - Coordinates to macro tile 1797 static void StoreGeneric( 1798 uint8_t *pSrcHotTile, 1799 SWR_SURFACE_STATE* pDstSurface, 1800 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) 1801 { 1802 PFN_STORE_TILES_INTERNAL pfnStore; 1803 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; 1804 1805 // Store each raster tile from the hot tile to the destination surface. 1806 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 1807 { 1808 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 1809 { 1810 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 1811 { 1812 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); 1813 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 1814 } 1815 } 1816 } 1817 1818 } 1819 1820 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); 1821 ////////////////////////////////////////////////////////////////////////// 1822 /// @brief Stores a macrotile to the destination surface. 1823 /// @param pSrc - Pointer to macro tile. 1824 /// @param pDstSurface - Destination surface state 1825 /// @param x, y - Coordinates to macro tile 1826 static void Store( 1827 uint8_t *pSrcHotTile, 1828 SWR_SURFACE_STATE* pDstSurface, 1829 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) 1830 { 1831 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; 1832 1833 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 1834 { 1835 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>( 1836 0, 1837 0, 1838 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces 1839 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays 1840 sampleNum, 1841 pDstSurface->lod, 1842 pDstSurface); 1843 1844 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear 1845 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || 1846 (pDstSurface->bInterleavedSamples); 1847 1848 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; 1849 } 1850 1851 // Save original for pSrcHotTile resolve. 1852 uint8_t *pResolveSrcHotTile = pSrcHotTile; 1853 1854 // Store each raster tile from the hot tile to the destination surface. 1855 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 1856 { 1857 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 1858 { 1859 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 1860 { 1861 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); 1862 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 1863 } 1864 } 1865 } 1866 1867 if (pDstSurface->xpAuxBaseAddress) 1868 { 1869 uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 1870 // Store each raster tile from the hot tile to the destination surface. 1871 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 1872 { 1873 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 1874 { 1875 StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex); 1876 pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples; 1877 } 1878 } 1879 } 1880 } 1881 }; 1882 1883 ////////////////////////////////////////////////////////////////////////// 1884 /// InitStoreTilesTable - Helper for setting up the tables. 1885 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> 1886 void InitStoreTilesTableColor_Half1( 1887 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) 1888 { 1889 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; 1890 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; 1891 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; 1892 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; 1893 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store; 1894 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store; 1895 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; 1896 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; 1897 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; 1898 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store; 1899 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store; 1900 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; 1901 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; 1902 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; 1903 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; 1904 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; 1905 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; 1906 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store; 1907 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store; 1908 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; 1909 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store; 1910 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; 1911 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; 1912 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store; 1913 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store; 1914 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store; 1915 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store; 1916 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; 1917 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; 1918 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; 1919 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; 1920 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; 1921 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; 1922 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; 1923 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; 1924 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; 1925 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; 1926 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; 1927 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; 1928 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store; 1929 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store; 1930 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; 1931 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; 1932 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; 1933 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; 1934 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric; 1935 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store; 1936 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store; 1937 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store; 1938 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric; 1939 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric; 1940 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store; 1941 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; 1942 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; 1943 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; 1944 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; 1945 } 1946 1947 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> 1948 void InitStoreTilesTableColor_Half2( 1949 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT]) 1950 { 1951 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric; 1952 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; 1953 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric; 1954 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store; 1955 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store; 1956 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store; 1957 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store; 1958 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store; 1959 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store; 1960 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; 1961 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; 1962 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; 1963 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; 1964 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric; 1965 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; 1966 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; 1967 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; 1968 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store; 1969 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store; 1970 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store; 1971 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store; 1972 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store; 1973 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store; 1974 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store; 1975 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store; 1976 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store; 1977 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; 1978 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; 1979 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store; 1980 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store; 1981 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store; 1982 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store; 1983 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric; 1984 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric; 1985 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store; 1986 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store; 1987 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store; 1988 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store; 1989 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store; 1990 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store; 1991 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store; 1992 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; 1993 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; 1994 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store; 1995 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store; 1996 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; 1997 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; 1998 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; 1999 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store; 2000 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store; 2001 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; 2002 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; 2003 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; 2004 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; 2005 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric; 2006 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric; 2007 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; 2008 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; 2009 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric; 2010 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric; 2011 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; 2012 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric; 2013 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; 2014 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; 2015 } 2016 2017 ////////////////////////////////////////////////////////////////////////// 2018 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. 2019 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> 2020 void InitStoreTilesTableDepth( 2021 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) 2022 { 2023 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store; 2024 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; 2025 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; 2026 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store; 2027 } 2028 2029 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> 2030 void InitStoreTilesTableStencil( 2031 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) 2032 { 2033 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store; 2034 } 2035 2036 2037 ////////////////////////////////////////////////////////////////////////// 2038 /// @brief Deswizzles and stores a full hottile to a render surface 2039 /// @param hPrivateContext - Handle to private DC 2040 /// @param srcFormat - Format for hot tile. 2041 /// @param renderTargetIndex - Index to destination render target 2042 /// @param x, y - Coordinates to raster tile. 2043 /// @param pSrcHotTile - Pointer to Hot Tile 2044 void SwrStoreHotTileToSurface( 2045 HANDLE hWorkerPrivateData, 2046 SWR_SURFACE_STATE *pDstSurface, 2047 BucketManager* pBucketMgr, 2048 SWR_FORMAT srcFormat, 2049 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, 2050 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, 2051 uint8_t *pSrcHotTile); 2052