1 /**************************************************************************** 2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file StoreTile.h 24 * 25 * @brief Functionality for Store. 26 * 27 ******************************************************************************/ 28 #pragma once 29 30 #include "common/os.h" 31 #include "common/formats.h" 32 #include "core/context.h" 33 #include "core/rdtsc_core.h" 34 #include "core/format_conversion.h" 35 36 #include "memory/TilingFunctions.h" 37 #include "memory/Convert.h" 38 #include "core/multisample.h" 39 40 #include <array> 41 #include <sstream> 42 43 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 44 45 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats. 
46 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); 47 48 ////////////////////////////////////////////////////////////////////////// 49 /// Store Raster Tile Function Tables. 50 ////////////////////////////////////////////////////////////////////////// 51 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 52 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 53 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 54 55 void InitStoreTilesTable_Linear_1(); 56 void InitStoreTilesTable_Linear_2(); 57 void InitStoreTilesTable_TileX_1(); 58 void InitStoreTilesTable_TileX_2(); 59 void InitStoreTilesTable_TileY_1(); 60 void InitStoreTilesTable_TileY_2(); 61 void InitStoreTilesTable_TileW(); 62 void InitStoreTilesTable(); 63 64 ////////////////////////////////////////////////////////////////////////// 65 /// StorePixels 66 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 67 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 68 /// @param ppDsts - Array of destination pointers. Each pointer is 69 /// to a single row of at most 16B. 70 /// @tparam NumDests - Number of destination pointers. Each pair of 71 /// pointers is for a 16-byte column of two rows. 72 ////////////////////////////////////////////////////////////////////////// 73 template <size_t PixelSize, size_t NumDests> 74 struct StorePixels 75 { 76 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; 77 }; 78 79 ////////////////////////////////////////////////////////////////////////// 80 /// StorePixels (32-bit pixel specialization) 81 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 82 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 83 /// @param ppDsts - Array of destination pointers. Each pointer is 84 /// to a single row of at most 16B. 85 /// @tparam NumDests - Number of destination pointers. 
Each pair of 86 /// pointers is for a 16-byte column of two rows. 87 ////////////////////////////////////////////////////////////////////////// 88 template <> 89 struct StorePixels<8, 2> 90 { 91 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 92 { 93 // Each 4-pixel row is 4 bytes. 94 const uint16_t* pPixSrc = (const uint16_t*)pSrc; 95 96 // Unswizzle from SWR-Z order 97 uint16_t* pRow = (uint16_t*)ppDsts[0]; 98 pRow[0] = pPixSrc[0]; 99 pRow[1] = pPixSrc[2]; 100 101 pRow = (uint16_t*)ppDsts[1]; 102 pRow[0] = pPixSrc[1]; 103 pRow[1] = pPixSrc[3]; 104 } 105 }; 106 107 #if USE_8x2_TILE_BACKEND 108 template <> 109 struct StorePixels<8, 4> 110 { 111 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 112 { 113 // 8 x 2 bytes = 16 bytes, 16 pixels 114 const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc); 115 116 uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts); 117 118 // Unswizzle from SWR-Z order 119 ppDsts16[0][0] = pSrc16[0]; // 0 1 120 ppDsts16[0][1] = pSrc16[2]; // 4 5 121 122 ppDsts16[1][0] = pSrc16[1]; // 2 3 123 ppDsts16[1][1] = pSrc16[3]; // 6 7 124 125 ppDsts16[2][0] = pSrc16[4]; // 8 9 126 ppDsts16[2][1] = pSrc16[6]; // C D 127 128 ppDsts16[3][0] = pSrc16[5]; // A B 129 ppDsts16[3][1] = pSrc16[7]; // E F 130 } 131 }; 132 133 #endif 134 ////////////////////////////////////////////////////////////////////////// 135 /// StorePixels (32-bit pixel specialization) 136 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 137 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 138 /// @param ppDsts - Array of destination pointers. Each pointer is 139 /// to a single row of at most 16B. 140 /// @tparam NumDests - Number of destination pointers. Each pair of 141 /// pointers is for a 16-byte column of two rows. 
142 ////////////////////////////////////////////////////////////////////////// 143 template <> 144 struct StorePixels<16, 2> 145 { 146 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 147 { 148 // Each 4-pixel row is 8 bytes. 149 const uint32_t* pPixSrc = (const uint32_t*)pSrc; 150 151 // Unswizzle from SWR-Z order 152 uint32_t* pRow = (uint32_t*)ppDsts[0]; 153 pRow[0] = pPixSrc[0]; 154 pRow[1] = pPixSrc[2]; 155 156 pRow = (uint32_t*)ppDsts[1]; 157 pRow[0] = pPixSrc[1]; 158 pRow[1] = pPixSrc[3]; 159 } 160 }; 161 162 #if USE_8x2_TILE_BACKEND 163 template <> 164 struct StorePixels<16, 4> 165 { 166 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 167 { 168 // 8 x 4 bytes = 32 bytes, 16 pixels 169 const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc); 170 171 uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts); 172 173 // Unswizzle from SWR-Z order 174 ppDsts32[0][0] = pSrc32[0]; // 0 1 175 ppDsts32[0][1] = pSrc32[2]; // 4 5 176 177 ppDsts32[1][0] = pSrc32[1]; // 2 3 178 ppDsts32[1][1] = pSrc32[3]; // 6 7 179 180 ppDsts32[2][0] = pSrc32[4]; // 8 9 181 ppDsts32[2][1] = pSrc32[6]; // C D 182 183 ppDsts32[3][0] = pSrc32[5]; // A B 184 ppDsts32[3][1] = pSrc32[7]; // E F 185 } 186 }; 187 188 #endif 189 ////////////////////////////////////////////////////////////////////////// 190 /// StorePixels (32-bit pixel specialization) 191 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 192 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 193 /// @param ppDsts - Array of destination pointers. Each pointer is 194 /// to a single row of at most 16B. 195 /// @tparam NumDests - Number of destination pointers. Each pair of 196 /// pointers is for a 16-byte column of two rows. 
197 ////////////////////////////////////////////////////////////////////////// 198 template <> 199 struct StorePixels<32, 2> 200 { 201 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 202 { 203 // Each 4-pixel row is 16-bytes 204 simd4scalari *pZRow01 = (simd4scalari*)pSrc; 205 simd4scalari vQuad00 = SIMD128::load_si(pZRow01); 206 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1); 207 208 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01); 209 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01); 210 211 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00); 212 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10); 213 } 214 }; 215 216 #if USE_8x2_TILE_BACKEND 217 template <> 218 struct StorePixels<32, 4> 219 { 220 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 221 { 222 // 4 x 16 bytes = 64 bytes, 16 pixels 223 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); 224 225 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); 226 227 // Unswizzle from SWR-Z order 228 simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3 229 simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7 230 simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B 231 simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F 232 233 SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5 234 SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7 235 SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D 236 SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F 237 } 238 }; 239 240 #endif 241 ////////////////////////////////////////////////////////////////////////// 242 /// StorePixels (32-bit pixel specialization) 243 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 
244 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 245 /// @param ppDsts - Array of destination pointers. Each pointer is 246 /// to a single row of at most 16B. 247 /// @tparam NumDests - Number of destination pointers. Each pair of 248 /// pointers is for a 16-byte column of two rows. 249 ////////////////////////////////////////////////////////////////////////// 250 template <> 251 struct StorePixels<64, 4> 252 { 253 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 254 { 255 // Each 4-pixel row is 32 bytes. 256 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc; 257 258 // order of pointers match SWR-Z layout 259 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0]; 260 *pvDsts[0] = pPixSrc[0]; 261 *pvDsts[1] = pPixSrc[1]; 262 *pvDsts[2] = pPixSrc[2]; 263 *pvDsts[3] = pPixSrc[3]; 264 } 265 }; 266 267 #if USE_8x2_TILE_BACKEND 268 template <> 269 struct StorePixels<64, 8> 270 { 271 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) 272 { 273 // 8 x 16 bytes = 128 bytes, 16 pixels 274 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); 275 276 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); 277 278 // order of pointers match SWR-Z layout 279 *ppDsts128[0] = pSrc128[0]; // 0 1 280 *ppDsts128[1] = pSrc128[1]; // 2 3 281 *ppDsts128[2] = pSrc128[2]; // 4 5 282 *ppDsts128[3] = pSrc128[3]; // 6 7 283 *ppDsts128[4] = pSrc128[4]; // 8 9 284 *ppDsts128[5] = pSrc128[5]; // A B 285 *ppDsts128[6] = pSrc128[6]; // C D 286 *ppDsts128[7] = pSrc128[7]; // E F 287 } 288 }; 289 290 #endif 291 ////////////////////////////////////////////////////////////////////////// 292 /// StorePixels (32-bit pixel specialization) 293 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 294 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 295 /// @param ppDsts - Array of destination pointers. Each pointer is 296 /// to a single row of at most 16B. 
297 /// @tparam NumDests - Number of destination pointers. Each pair of 298 /// pointers is for a 16-byte column of two rows. 299 ////////////////////////////////////////////////////////////////////////// 300 template <> 301 struct StorePixels<128, 8> 302 { 303 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) 304 { 305 // Each 4-pixel row is 64 bytes. 306 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc; 307 308 // Unswizzle from SWR-Z order 309 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0]; 310 *pvDsts[0] = pPixSrc[0]; 311 *pvDsts[1] = pPixSrc[2]; 312 *pvDsts[2] = pPixSrc[1]; 313 *pvDsts[3] = pPixSrc[3]; 314 *pvDsts[4] = pPixSrc[4]; 315 *pvDsts[5] = pPixSrc[6]; 316 *pvDsts[6] = pPixSrc[5]; 317 *pvDsts[7] = pPixSrc[7]; 318 } 319 }; 320 321 #if USE_8x2_TILE_BACKEND 322 template <> 323 struct StorePixels<128, 16> 324 { 325 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16]) 326 { 327 // 16 x 16 bytes = 256 bytes, 16 pixels 328 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); 329 330 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); 331 332 for (uint32_t i = 0; i < 16; i += 4) 333 { 334 *ppDsts128[i + 0] = pSrc128[i + 0]; 335 *ppDsts128[i + 1] = pSrc128[i + 2]; 336 *ppDsts128[i + 2] = pSrc128[i + 1]; 337 *ppDsts128[i + 3] = pSrc128[i + 3]; 338 } 339 } 340 }; 341 342 #endif 343 ////////////////////////////////////////////////////////////////////////// 344 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 345 ////////////////////////////////////////////////////////////////////////// 346 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 347 struct ConvertPixelsSOAtoAOS 348 { 349 ////////////////////////////////////////////////////////////////////////// 350 /// @brief Converts a SIMD from the Hot Tile to the destination format 351 /// and converts from SOA to AOS. 352 /// @param pSrc - Pointer to raster tile. 
353 /// @param pDst - Pointer to destination surface or deswizzling buffer. 354 template <size_t NumDests> 355 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 356 { 357 #if USE_8x2_TILE_BACKEND 358 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 359 360 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 361 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 362 363 // Convert from SrcFormat --> DstFormat 364 simd16vector src; 365 LoadSOA<SrcFormat>(pSrc, src); 366 StoreSOA<DstFormat>(src, soaTile); 367 368 // Convert from SOA --> AOS 369 FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile); 370 371 #else 372 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 373 374 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 375 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 376 377 // Convert from SrcFormat --> DstFormat 378 simdvector src; 379 LoadSOA<SrcFormat>(pSrc, src); 380 StoreSOA<DstFormat>(src, soaTile); 381 382 // Convert from SOA --> AOS 383 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile); 384 385 #endif 386 // Store data into destination 387 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); 388 } 389 }; 390 391 ////////////////////////////////////////////////////////////////////////// 392 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 393 /// Specialization for no format conversion 394 ////////////////////////////////////////////////////////////////////////// 395 template<SWR_FORMAT Format> 396 struct ConvertPixelsSOAtoAOS<Format, Format> 397 { 398 ////////////////////////////////////////////////////////////////////////// 399 /// @brief Converts a SIMD from the Hot Tile to the destination format 400 /// and converts from SOA to AOS. 401 /// @param pSrc - Pointer to raster tile. 402 /// @param pDst - Pointer to destination surface or deswizzling buffer. 
403 template <size_t NumDests> 404 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 405 { 406 #if USE_8x2_TILE_BACKEND 407 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 408 409 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 410 411 // Convert from SOA --> AOS 412 FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile); 413 414 #else 415 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 416 417 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 418 419 // Convert from SOA --> AOS 420 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile); 421 422 #endif 423 // Store data into destination 424 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts); 425 } 426 }; 427 428 ////////////////////////////////////////////////////////////////////////// 429 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM 430 ////////////////////////////////////////////////////////////////////////// 431 template<> 432 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > 433 { 434 ////////////////////////////////////////////////////////////////////////// 435 /// @brief Converts a SIMD from the Hot Tile to the destination format 436 /// and converts from SOA to AOS. 437 /// @param pSrc - Pointer to raster tile. 438 /// @param pDst - Pointer to destination surface or deswizzling buffer. 
template <size_t NumDests>
    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    {
        // Both backend variants follow the same steps: load the SOA hot tile,
        // pick the three channels in DstFormat's swizzle order, clamp and
        // normalize them, OR the channels together into one 32-bit lane per
        // pixel, then narrow each lane to 16 bits in aosTile.
#if USE_8x2_TILE_BACKEND
        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;

        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel

        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];

        // Load hot-tile
        simd16vector src, dst;
        LoadSOA<SrcFormat>(pSrc, src);

        // deswizzle
        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];

        // clamp
        dst.x = Clamp<DstFormat>(dst.x, 0);
        dst.y = Clamp<DstFormat>(dst.y, 1);
        dst.z = Clamp<DstFormat>(dst.z, 2);

        // normalize
        dst.x = Normalize<DstFormat>(dst.x, 0);
        dst.y = Normalize<DstFormat>(dst.y, 1);
        dst.z = Normalize<DstFormat>(dst.z, 2);

        // pack
        // NOTE(review): the bits of dst.x/y/z are reinterpreted as integers
        // here, so Normalize presumably leaves integer values in the lanes -
        // confirm against Normalize<>.
        simd16scalari packed = _simd16_castps_si(dst.x);

        // The shift amounts below are hard-coded (5 and 5 + 6); these asserts
        // check that they match the bit counts of components 0 and 1.
        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);

        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));

        // pack low 16 bits of each 32 bit lane to low 128 bits of dst
        // (the uint32_t -> uint16_t assignment drops the upper 16 bits)
        uint32_t *pPacked = (uint32_t*)&packed;
        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
        for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
        {
            *pAosTile++ = *pPacked++;
        }

#else
        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel

        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];

        // Load hot-tile
        simdvector src, dst;
        LoadSOA<SrcFormat>(pSrc, src);

        // deswizzle
        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];

        // clamp
        dst.x = Clamp<DstFormat>(dst.x, 0);
        dst.y = Clamp<DstFormat>(dst.y, 1);
        dst.z = Clamp<DstFormat>(dst.z, 2);

        // normalize
        dst.x = Normalize<DstFormat>(dst.x, 0);
        dst.y = Normalize<DstFormat>(dst.y, 1);
        dst.z = Normalize<DstFormat>(dst.z, 2);

        // pack
        // Unlike the 8x2 path, the shift amounts come from GetConstBPC():
        // component 1 is shifted past component 0, and component 2 past both.
        simdscalari packed = _simd_castps_si(dst.x);
        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
                                                                              FormatTraits<DstFormat>::GetConstBPC(1)));

        // pack low 16 bits of each 32 bit lane to low 128 bits of dst
        // (the uint32_t -> uint16_t assignment drops the upper 16 bits)
        uint32_t *pPacked = (uint32_t*)&packed;
        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
        for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
        {
            *pAosTile++ = *pPacked++;
        }

#endif
        // Store data into destination
        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
    }
};

//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
//////////////////////////////////////////////////////////////////////////
template<>
struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
{
    static const SWR_FORMAT SrcFormat = R32_FLOAT;
    static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;

    //////////////////////////////////////////////////////////////////////////
    /// @brief Converts a SIMD from the Hot Tile to the destination format
    ///        and converts from SOA to AOS.
    /// @param pSrc - Pointer to raster tile.
    /// @param pDst - Pointer to destination surface or deswizzling buffer.
546 template <size_t NumDests> 547 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 548 { 549 #if USE_8x2_TILE_BACKEND 550 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 551 552 // clamp 553 const simd16scalar zero = _simd16_setzero_ps(); 554 const simd16scalar ones = _simd16_set1_ps(1.0f); 555 556 comp = _simd16_max_ps(comp, zero); 557 comp = _simd16_min_ps(comp, ones); 558 559 // normalize 560 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 561 562 simd16scalari temp = _simd16_cvtps_epi32(comp); 563 564 // swizzle 565 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 566 567 // merge/store data into destination but don't overwrite the X8 bits 568 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0])); 569 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2])); 570 571 simd16scalari dest = _simd16_setzero_si(); 572 573 dest = _simd16_insert_si(dest, destlo, 0); 574 dest = _simd16_insert_si(dest, desthi, 1); 575 576 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF); 577 578 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp)); 579 580 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0)); 581 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1)); 582 #else 583 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 584 585 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 586 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 587 588 // Convert from SrcFormat --> DstFormat 589 simdvector src; 590 LoadSOA<SrcFormat>(pSrc, src); 591 
StoreSOA<DstFormat>(src, soaTile); 592 593 // Convert from SOA --> AOS 594 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile); 595 596 // Store data into destination but don't overwrite the X8 bits 597 // Each 4-pixel row is 16-bytes 598 simd4scalari *pZRow01 = (simd4scalari*)aosTile; 599 simd4scalari vQuad00 = SIMD128::load_si(pZRow01); 600 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1); 601 602 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01); 603 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01); 604 605 simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]); 606 simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]); 607 608 simd4scalari vMask = _mm_set1_epi32(0xFFFFFF); 609 610 vDst0 = SIMD128::andnot_si(vMask, vDst0); 611 vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask)); 612 vDst1 = SIMD128::andnot_si(vMask, vDst1); 613 vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask)); 614 615 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0); 616 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1); 617 #endif 618 } 619 }; 620 621 #if USE_8x2_TILE_BACKEND 622 template<SWR_FORMAT DstFormat> 623 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) 624 { 625 // swizzle rgba -> bgra while we load 626 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr 627 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg 628 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb 629 simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * 
sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa 630 631 // clamp 632 const simd16scalar zero = _simd16_setzero_ps(); 633 const simd16scalar ones = _simd16_set1_ps(1.0f); 634 635 comp0 = _simd16_max_ps(comp0, zero); 636 comp0 = _simd16_min_ps(comp0, ones); 637 638 comp1 = _simd16_max_ps(comp1, zero); 639 comp1 = _simd16_min_ps(comp1, ones); 640 641 comp2 = _simd16_max_ps(comp2, zero); 642 comp2 = _simd16_min_ps(comp2, ones); 643 644 comp3 = _simd16_max_ps(comp3, zero); 645 comp3 = _simd16_min_ps(comp3, ones); 646 647 // gamma-correct only rgb 648 if (FormatTraits<DstFormat>::isSRGB) 649 { 650 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); 651 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); 652 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); 653 } 654 655 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format 656 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 657 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 658 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 659 comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); 660 661 // moving to 16 wide integer vector types 662 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr 663 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg 664 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb 665 simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa 666 667 // SOA to AOS conversion 668 src1 = _simd16_slli_epi32(src1, 8); 669 src2 = _simd16_slli_epi32(src2, 16); 670 src3 = _simd16_slli_epi32(src3, 24); 671 672 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F 673 674 // de-swizzle conversion 675 #if 1 676 
simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B 677 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F 678 679 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F 680 681 #else 682 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 683 684 #endif 685 // store 8x2 memory order: 686 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } 687 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } 688 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0)); 689 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1)); 690 } 691 692 #endif 693 template<SWR_FORMAT DstFormat> 694 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) 695 { 696 static const uint32_t offset = sizeof(simdscalar); 697 698 // swizzle rgba -> bgra while we load 699 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr 700 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg 701 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb 702 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa 703 704 // clamp 705 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); 706 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); 707 708 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); 709 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); 710 711 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); 712 vComp2 = 
_simd_min_ps(vComp2, _simd_set1_ps(1.0f)); 713 714 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); 715 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); 716 717 if (FormatTraits<DstFormat>::isSRGB) 718 { 719 // Gamma-correct only rgb 720 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); 721 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); 722 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); 723 } 724 725 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format 726 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 727 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 728 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 729 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); 730 731 // moving to 8 wide integer vector types 732 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr 733 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg 734 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb 735 simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa 736 737 #if KNOB_ARCH <= KNOB_ARCH_AVX 738 739 // splitting into two sets of 4 wide integer vector types 740 // because AVX doesn't have instructions to support this operation at 8 wide 741 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r 742 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g 743 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b 744 simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a 745 746 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r 747 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g 748 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b 749 
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// FlatConvertNoAlpha (simd16 overload, 8x2 tile backend)
/// @brief Converts one simd16 tile of SOA float color into packed 32-bit
///        AOS pixels using only three components. No fourth component
///        is merged in, so bits 24..31 of each pixel stay clear as long
///        as every scaled component fits in 8 bits.
/// @tparam DstFormat - Destination format; supplies the component
///         swizzle, the sRGB flag and the per-component float scale.
/// @param pSrc  - Source tile: one simd16scalar of floats per component.
/// @param pDst0 - Row 0, first  16 bytes (pixels 0 1 4 5).
/// @param pDst1 - Row 1, first  16 bytes (pixels 2 3 6 7).
/// @param pDst2 - Row 0, second 16 bytes (pixels 8 9 C D).
/// @param pDst3 - Row 1, second 16 bytes (pixels A B E F).
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    const simd16scalar vZero = _simd16_setzero_ps();
    const simd16scalar vOne  = _simd16_set1_ps(1.0f);

    // The destination swizzle decides which source plane feeds each output component.
    const float* pPlane0 = reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar));
    const float* pPlane1 = reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar));
    const float* pPlane2 = reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar));

    simd16scalar chan0 = _simd16_load_ps(pPlane0);
    simd16scalar chan1 = _simd16_load_ps(pPlane1);
    simd16scalar chan2 = _simd16_load_ps(pPlane2);

    // Saturate every component to [0.0f, 1.0f].
    chan0 = _simd16_min_ps(_simd16_max_ps(chan0, vZero), vOne);
    chan1 = _simd16_min_ps(_simd16_max_ps(chan1, vZero), vOne);
    chan2 = _simd16_min_ps(_simd16_max_ps(chan2, vZero), vOne);

    // sRGB destinations get the three color components gamma-corrected.
    if (FormatTraits<DstFormat>::isSRGB)
    {
        chan0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, chan0);
        chan1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, chan1);
        chan2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, chan2);
    }

    // Scale the normalized floats up to the destination's integer range.
    chan0 = _simd16_mul_ps(chan0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    chan1 = _simd16_mul_ps(chan1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    chan2 = _simd16_mul_ps(chan2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));

    // One 32-bit integer lane per pixel and component.
    simd16scalari bits0 = _simd16_cvtps_epi32(chan0);
    simd16scalari bits1 = _simd16_cvtps_epi32(chan1);
    simd16scalari bits2 = _simd16_cvtps_epi32(chan2);

    // SOA -> AOS: component 1 moves to bits 8..15, component 2 to bits 16..23.
    bits1 = _simd16_slli_epi32(bits1, 8);
    bits2 = _simd16_slli_epi32(bits2, 16);

    // Pixel order at this point: 0 1 2 3 4 5 6 7 8 9 A B C D E F
    simd16scalari texels = _simd16_or_si(_simd16_or_si(bits0, bits1), bits2);

    // De-swizzle into pixel order 0 1 4 5 2 3 6 7 8 9 C D A B E F
#if 1
    simd16scalari evenQuads = _simd16_permute2f128_si(texels, texels, 0xA0); // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari oddQuads  = _simd16_permute2f128_si(texels, texels, 0xF5); // 4 5 6 7 4 5 6 7 C D E F C D E F

    texels = _simd16_shuffle_epi64(evenQuads, oddQuads, 0xCC);
#else
    texels = _simd16_permute_epi32(texels, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
#endif

    // 8x2 memory order:
    //   row 0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //   row 1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    _simd_storeu2_si(reinterpret_cast<simd4scalari*>(pDst1), reinterpret_cast<simd4scalari*>(pDst0), _simd16_extract_si(texels, 0));
    _simd_storeu2_si(reinterpret_cast<simd4scalari*>(pDst3), reinterpret_cast<simd4scalari*>(pDst2), _simd16_extract_si(texels, 1));
}

#endif
_simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F 838 839 // de-swizzle conversion 840 #if 1 841 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B 842 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F 843 844 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F 845 846 #else 847 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 848 849 #endif 850 // store 8x2 memory order: 851 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } 852 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } 853 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0)); 854 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1)); 855 } 856 857 #endif 858 template<SWR_FORMAT DstFormat> 859 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) 860 { 861 static const uint32_t offset = sizeof(simdscalar); 862 863 // swizzle rgba -> bgra while we load 864 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr 865 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg 866 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb 867 // clamp 868 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); 869 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); 870 871 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); 872 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); 873 874 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); 875 vComp2 = 
_simd_min_ps(vComp2, _simd_set1_ps(1.0f)); 876 877 if (FormatTraits<DstFormat>::isSRGB) 878 { 879 // Gamma-correct only rgb 880 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); 881 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); 882 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); 883 } 884 885 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format 886 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 887 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 888 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 889 890 // moving to 8 wide integer vector types 891 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr 892 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg 893 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb 894 895 #if KNOB_ARCH <= KNOB_ARCH_AVX 896 897 // splitting into two sets of 4 wide integer vector types 898 // because AVX doesn't have instructions to support this operation at 8 wide 899 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r 900 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g 901 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b 902 903 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r 904 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g 905 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b 906 907 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 908 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 909 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 910 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 911 912 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr 913 914 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); 
// 00gr00gr00gr00gr 915 916 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr 917 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr 918 919 // unpack into rows that get the tiling order correct 920 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr 921 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0); 922 923 simdscalari final = _mm256_castsi128_si256(vRow00); 924 final = _mm256_insertf128_si256(final, vRow10, 1); 925 926 #else 927 928 // logic is as above, only wider 929 src1 = _mm256_slli_si256(src1, 1); 930 src2 = _mm256_slli_si256(src2, 2); 931 932 src0 = _mm256_or_si256(src0, src1); 933 934 simdscalari final = _mm256_or_si256(src0, src2); 935 936 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 937 final = _mm256_permute4x64_epi64(final, 0xD8); 938 939 #endif 940 941 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final); 942 } 943 944 template<> 945 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> 946 { 947 template <size_t NumDests> 948 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 949 { 950 #if USE_8x2_TILE_BACKEND 951 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 952 #else 953 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 954 #endif 955 } 956 }; 957 958 template<> 959 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> 960 { 961 template <size_t NumDests> 962 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 963 { 964 #if USE_8x2_TILE_BACKEND 965 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 966 #else 967 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 968 #endif 969 } 970 }; 971 972 template<> 973 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > 974 { 975 template <size_t NumDests> 976 INLINE static void Convert(const uint8_t* pSrc, 
uint8_t* (&ppDsts)[NumDests]) 977 { 978 #if USE_8x2_TILE_BACKEND 979 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 980 #else 981 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 982 #endif 983 } 984 }; 985 986 template<> 987 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > 988 { 989 template <size_t NumDests> 990 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 991 { 992 #if USE_8x2_TILE_BACKEND 993 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 994 #else 995 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 996 #endif 997 } 998 }; 999 1000 template<> 1001 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > 1002 { 1003 template <size_t NumDests> 1004 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 1005 { 1006 #if USE_8x2_TILE_BACKEND 1007 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1008 #else 1009 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 1010 #endif 1011 } 1012 }; 1013 1014 template<> 1015 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > 1016 { 1017 template <size_t NumDests> 1018 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 1019 { 1020 #if USE_8x2_TILE_BACKEND 1021 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1022 #else 1023 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 1024 #endif 1025 } 1026 }; 1027 1028 template<> 1029 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > 1030 { 1031 template <size_t NumDests> 1032 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 1033 { 1034 #if USE_8x2_TILE_BACKEND 1035 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1036 #else 1037 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, 
ppDsts[0], ppDsts[1]); 1038 #endif 1039 } 1040 }; 1041 1042 template<> 1043 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > 1044 { 1045 template <size_t NumDests> 1046 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 1047 { 1048 #if USE_8x2_TILE_BACKEND 1049 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1050 #else 1051 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 1052 #endif 1053 } 1054 }; 1055 1056 ////////////////////////////////////////////////////////////////////////// 1057 /// StoreRasterTile 1058 ////////////////////////////////////////////////////////////////////////// 1059 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1060 struct StoreRasterTile 1061 { 1062 ////////////////////////////////////////////////////////////////////////// 1063 /// @brief Retrieve color from hot tile source which is always float. 1064 /// @param pSrc - Pointer to raster tile. 1065 /// @param x, y - Coordinates to raster tile. 1066 /// @param output - output color 1067 INLINE static void GetSwizzledSrcColor( 1068 uint8_t* pSrc, 1069 uint32_t x, uint32_t y, 1070 float outputColor[4]) 1071 { 1072 #if USE_8x2_TILE_BACKEND 1073 typedef SimdTile_16<SrcFormat, DstFormat> SimdT; 1074 1075 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc); 1076 1077 // Compute which simd tile we're accessing within 8x8 tile. 1078 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. 
1079 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM); 1080 1081 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex]; 1082 1083 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); 1084 1085 pSimdTile->GetSwizzledColor(simdOffset, outputColor); 1086 #else 1087 typedef SimdTile<SrcFormat, DstFormat> SimdT; 1088 1089 SimdT* pSrcSimdTiles = (SimdT*)pSrc; 1090 1091 // Compute which simd tile we're accessing within 8x8 tile. 1092 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. 1093 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); 1094 1095 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; 1096 1097 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); 1098 1099 pSimdTile->GetSwizzledColor(simdOffset, outputColor); 1100 #endif 1101 } 1102 1103 ////////////////////////////////////////////////////////////////////////// 1104 /// @brief Stores an 8x8 raster tile to the destination surface. 1105 /// @param pSrc - Pointer to raster tile. 1106 /// @param pDstSurface - Destination surface state 1107 /// @param x, y - Coordinates to raster tile. 1108 INLINE static void Store( 1109 uint8_t *pSrc, 1110 SWR_SURFACE_STATE* pDstSurface, 1111 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. 1112 { 1113 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1114 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1115 1116 // For each raster tile pixel (rx, ry) 1117 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) 1118 { 1119 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) 1120 { 1121 // Perform bounds checking. 
1122 if (((x + rx) < lodWidth) && 1123 ((y + ry) < lodHeight)) 1124 { 1125 float srcColor[4]; 1126 GetSwizzledSrcColor(pSrc, rx, ry, srcColor); 1127 1128 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry), 1129 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, 1130 sampleNum, pDstSurface->lod, pDstSurface); 1131 { 1132 ConvertPixelFromFloat<DstFormat>(pDst, srcColor); 1133 } 1134 } 1135 } 1136 } 1137 } 1138 1139 ////////////////////////////////////////////////////////////////////////// 1140 /// @brief Resolves an 8x8 raster tile to the resolve destination surface. 1141 /// @param pSrc - Pointer to raster tile. 1142 /// @param pDstSurface - Destination surface state 1143 /// @param x, y - Coordinates to raster tile. 1144 /// @param sampleOffset - Offset between adjacent multisamples 1145 INLINE static void Resolve( 1146 uint8_t *pSrc, 1147 SWR_SURFACE_STATE* pDstSurface, 1148 uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. 1149 { 1150 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1151 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1152 1153 float oneOverNumSamples = 1.0f / pDstSurface->numSamples; 1154 1155 // For each raster tile pixel (rx, ry) 1156 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) 1157 { 1158 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) 1159 { 1160 // Perform bounds checking. 
1161 if (((x + rx) < lodWidth) && 1162 ((y + ry) < lodHeight)) 1163 { 1164 // Sum across samples 1165 float resolveColor[4] = {0}; 1166 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 1167 { 1168 float sampleColor[4] = {0}; 1169 uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum; 1170 GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor); 1171 resolveColor[0] += sampleColor[0]; 1172 resolveColor[1] += sampleColor[1]; 1173 resolveColor[2] += sampleColor[2]; 1174 resolveColor[3] += sampleColor[3]; 1175 } 1176 1177 // Divide by numSamples to average 1178 resolveColor[0] *= oneOverNumSamples; 1179 resolveColor[1] *= oneOverNumSamples; 1180 resolveColor[2] *= oneOverNumSamples; 1181 resolveColor[3] *= oneOverNumSamples; 1182 1183 // Use the resolve surface state 1184 SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress; 1185 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry), 1186 pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex, 1187 0, pResolveSurface->lod, pResolveSurface); 1188 { 1189 ConvertPixelFromFloat<DstFormat>(pDst, resolveColor); 1190 } 1191 } 1192 } 1193 } 1194 } 1195 1196 }; 1197 1198 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1199 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat> 1200 {}; 1201 1202 ////////////////////////////////////////////////////////////////////////// 1203 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp 1204 ////////////////////////////////////////////////////////////////////////// 1205 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1206 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> 1207 { 1208 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile; 1209 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 
1210 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1211 1212 ////////////////////////////////////////////////////////////////////////// 1213 /// @brief Stores an 8x8 raster tile to the destination surface. 1214 /// @param pSrc - Pointer to raster tile. 1215 /// @param pDstSurface - Destination surface state 1216 /// @param x, y - Coordinates to raster tile. 1217 INLINE static void Store( 1218 uint8_t *pSrc, 1219 SWR_SURFACE_STATE* pDstSurface, 1220 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1221 { 1222 // Punt non-full tiles to generic store 1223 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1224 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1225 1226 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1227 { 1228 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1229 } 1230 1231 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1232 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1233 #if USE_8x2_TILE_BACKEND 1234 1235 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1236 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1237 1238 uint8_t* ppDsts[] = 1239 { 1240 pDst, // row 0, col 0 1241 pDst + pDstSurface->pitch, // row 1, col 0 1242 pDst + dx / 2, // row 0, col 1 1243 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1244 }; 1245 1246 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1247 { 1248 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1249 { 1250 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1251 1252 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1253 1254 ppDsts[0] += dx; 1255 ppDsts[1] += dx; 1256 ppDsts[2] += dx; 1257 ppDsts[3] += dx; 
1258 } 1259 1260 ppDsts[0] += dy; 1261 ppDsts[1] += dy; 1262 ppDsts[2] += dy; 1263 ppDsts[3] += dy; 1264 } 1265 #else 1266 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 1267 1268 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1269 { 1270 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 1271 1272 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 1273 { 1274 // Format conversion and convert from SOA to AOS, and store the rows. 1275 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 1276 1277 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1278 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1279 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 1280 } 1281 1282 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 1283 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 1284 } 1285 #endif 1286 } 1287 }; 1288 1289 ////////////////////////////////////////////////////////////////////////// 1290 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp 1291 ////////////////////////////////////////////////////////////////////////// 1292 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1293 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> 1294 { 1295 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile; 1296 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1297 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1298 1299 ////////////////////////////////////////////////////////////////////////// 1300 /// @brief Stores an 8x8 raster tile to the destination surface. 1301 /// @param pSrc - Pointer to raster tile. 1302 /// @param pDstSurface - Destination surface state 1303 /// @param x, y - Coordinates to raster tile. 
1304 INLINE static void Store( 1305 uint8_t *pSrc, 1306 SWR_SURFACE_STATE* pDstSurface, 1307 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1308 { 1309 // Punt non-full tiles to generic store 1310 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1311 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1312 1313 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1314 { 1315 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1316 } 1317 1318 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1319 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1320 #if USE_8x2_TILE_BACKEND 1321 1322 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1323 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1324 1325 uint8_t* ppDsts[] = 1326 { 1327 pDst, // row 0, col 0 1328 pDst + pDstSurface->pitch, // row 1, col 0 1329 pDst + dx / 2, // row 0, col 1 1330 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1331 }; 1332 1333 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1334 { 1335 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1336 { 1337 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1338 1339 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1340 1341 ppDsts[0] += dx; 1342 ppDsts[1] += dx; 1343 ppDsts[2] += dx; 1344 ppDsts[3] += dx; 1345 } 1346 1347 ppDsts[0] += dy; 1348 ppDsts[1] += dy; 1349 ppDsts[2] += dy; 1350 ppDsts[3] += dy; 1351 } 1352 #else 1353 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 1354 1355 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1356 { 1357 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 1358 1359 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / 
SIMD_TILE_X_DIM; ++col) 1360 { 1361 // Format conversion and convert from SOA to AOS, and store the rows. 1362 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 1363 1364 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1365 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1366 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 1367 } 1368 1369 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 1370 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 1371 } 1372 #endif 1373 } 1374 }; 1375 1376 ////////////////////////////////////////////////////////////////////////// 1377 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp 1378 ////////////////////////////////////////////////////////////////////////// 1379 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1380 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> 1381 { 1382 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile; 1383 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1384 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1385 1386 ////////////////////////////////////////////////////////////////////////// 1387 /// @brief Stores an 8x8 raster tile to the destination surface. 1388 /// @param pSrc - Pointer to raster tile. 1389 /// @param pDstSurface - Destination surface state 1390 /// @param x, y - Coordinates to raster tile. 
1391 INLINE static void Store( 1392 uint8_t *pSrc, 1393 SWR_SURFACE_STATE* pDstSurface, 1394 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1395 { 1396 // Punt non-full tiles to generic store 1397 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1398 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1399 1400 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1401 { 1402 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1403 } 1404 1405 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1406 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1407 #if USE_8x2_TILE_BACKEND 1408 1409 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1410 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1411 1412 uint8_t* ppDsts[] = 1413 { 1414 pDst, // row 0, col 0 1415 pDst + pDstSurface->pitch, // row 1, col 0 1416 pDst + dx / 2, // row 0, col 1 1417 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1418 }; 1419 1420 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1421 { 1422 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1423 { 1424 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1425 1426 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1427 1428 ppDsts[0] += dx; 1429 ppDsts[1] += dx; 1430 ppDsts[2] += dx; 1431 ppDsts[3] += dx; 1432 } 1433 1434 ppDsts[0] += dy; 1435 ppDsts[1] += dy; 1436 ppDsts[2] += dy; 1437 ppDsts[3] += dy; 1438 } 1439 #else 1440 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 1441 1442 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1443 { 1444 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 1445 1446 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / 
SIMD_TILE_X_DIM; ++col) 1447 { 1448 // Format conversion and convert from SOA to AOS, and store the rows. 1449 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 1450 1451 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1452 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1453 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 1454 } 1455 1456 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 1457 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 1458 } 1459 #endif 1460 } 1461 }; 1462 1463 ////////////////////////////////////////////////////////////////////////// 1464 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp 1465 ////////////////////////////////////////////////////////////////////////// 1466 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1467 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> 1468 { 1469 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile; 1470 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1471 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1472 static const size_t MAX_DST_COLUMN_BYTES = 16; 1473 #if !USE_8x2_TILE_BACKEND 1474 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 1475 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1476 #endif 1477 1478 ////////////////////////////////////////////////////////////////////////// 1479 /// @brief Stores an 8x8 raster tile to the destination surface. 1480 /// @param pSrc - Pointer to raster tile. 1481 /// @param pDstSurface - Destination surface state 1482 /// @param x, y - Coordinates to raster tile. 
1483 INLINE static void Store( 1484 uint8_t *pSrc, 1485 SWR_SURFACE_STATE* pDstSurface, 1486 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1487 { 1488 // Punt non-full tiles to generic store 1489 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1490 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1491 1492 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1493 { 1494 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1495 } 1496 1497 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1498 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1499 #if USE_8x2_TILE_BACKEND 1500 1501 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1502 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; 1503 1504 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1505 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets"); 1506 1507 uint8_t *ppDsts[] = 1508 { 1509 pDst, // row 0, col 0 1510 pDst + pDstSurface->pitch, // row 1, col 0 1511 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 1512 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 1513 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 1514 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 1515 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 1516 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3 1517 }; 1518 1519 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1520 { 1521 // Raster tile width is same as simd16 tile width 1522 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1523 1524 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1525 1526 pSrc += 
KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1527 1528 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) 1529 { 1530 ppDsts[i] += dy; 1531 } 1532 } 1533 #else 1534 uint8_t* ppDsts[] = 1535 { 1536 pDst, // row 0, col 0 1537 pDst + pDstSurface->pitch, // row 1, col 0 1538 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 1539 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 1540 }; 1541 1542 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1543 { 1544 uint8_t* ppStartRows[] = 1545 { 1546 ppDsts[0], 1547 ppDsts[1], 1548 ppDsts[2], 1549 ppDsts[3], 1550 }; 1551 1552 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 1553 { 1554 // Format conversion and convert from SOA to AOS, and store the rows. 1555 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1556 1557 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 1558 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 1559 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 1560 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 1561 pSrc += SRC_COLUMN_BYTES; 1562 } 1563 1564 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 1565 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 1566 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; 1567 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; 1568 } 1569 #endif 1570 } 1571 }; 1572 1573 ////////////////////////////////////////////////////////////////////////// 1574 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp 1575 ////////////////////////////////////////////////////////////////////////// 1576 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1577 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> 1578 { 1579 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; 1580 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1581 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1582 static const size_t 
MAX_DST_COLUMN_BYTES = 16; 1583 #if !USE_8x2_TILE_BACKEND 1584 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 1585 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1586 #endif 1587 1588 ////////////////////////////////////////////////////////////////////////// 1589 /// @brief Stores an 8x8 raster tile to the destination surface. 1590 /// @param pSrc - Pointer to raster tile. 1591 /// @param pDstSurface - Destination surface state 1592 /// @param x, y - Coordinates to raster tile. 1593 INLINE static void Store( 1594 uint8_t *pSrc, 1595 SWR_SURFACE_STATE* pDstSurface, 1596 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1597 { 1598 // Punt non-full tiles to generic store 1599 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1600 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1601 1602 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1603 { 1604 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1605 } 1606 1607 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1608 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1609 #if USE_8x2_TILE_BACKEND 1610 1611 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1612 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; 1613 1614 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1615 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets"); 1616 1617 uint8_t* ppDsts[] = 1618 { 1619 pDst, // row 0, col 0 1620 pDst + pDstSurface->pitch, // row 1, col 0 1621 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 1622 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 1623 pDst + 
MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 1624 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 1625 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 1626 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3 1627 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4 1628 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4 1629 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5 1630 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5 1631 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6 1632 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6 1633 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7 1634 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7 1635 }; 1636 1637 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1638 { 1639 // Raster tile width is same as simd16 tile width 1640 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1641 1642 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1643 1644 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1645 1646 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) 1647 { 1648 ppDsts[i] += dy; 1649 } 1650 } 1651 #else 1652 struct DstPtrs 1653 { 1654 uint8_t* ppDsts[8]; 1655 } ptrs; 1656 1657 // Need 8 pointers, 4 columns of 2 rows each 1658 for (uint32_t y = 0; y < 2; ++y) 1659 { 1660 for (uint32_t x = 0; x < 4; ++x) 1661 { 1662 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; 1663 } 1664 } 1665 1666 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1667 { 1668 DstPtrs startPtrs = ptrs; 1669 1670 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 1671 { 1672 // Format conversion and convert from SOA to AOS, and store the rows. 
1673 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); 1674 1675 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 1676 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 1677 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 1678 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 1679 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; 1680 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; 1681 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; 1682 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; 1683 pSrc += SRC_COLUMN_BYTES; 1684 } 1685 1686 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; 1687 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; 1688 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; 1689 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; 1690 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; 1691 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; 1692 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; 1693 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; 1694 } 1695 #endif 1696 } 1697 }; 1698 1699 ////////////////////////////////////////////////////////////////////////// 1700 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp 1701 ////////////////////////////////////////////////////////////////////////// 1702 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1703 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> 1704 { 1705 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile; 1706 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1707 1708 ////////////////////////////////////////////////////////////////////////// 1709 /// @brief Stores an 8x8 raster tile to the destination surface. 1710 /// @param pSrc - Pointer to raster tile. 1711 /// @param pDstSurface - Destination surface state 1712 /// @param x, y - Coordinates to raster tile. 
1713 INLINE static void Store( 1714 uint8_t *pSrc, 1715 SWR_SURFACE_STATE* pDstSurface, 1716 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1717 { 1718 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1719 1720 // Punt non-full tiles to generic store 1721 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1722 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1723 1724 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1725 { 1726 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1727 } 1728 1729 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1730 // We can compute the offsets to each column within the raster tile once and increment from these. 1731 #if USE_8x2_TILE_BACKEND 1732 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1733 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1734 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1735 1736 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1737 1738 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
1739 uint8_t *ppDsts[] = 1740 { 1741 pDst, 1742 pDst + DestRowWidthBytes, 1743 pDst + DestRowWidthBytes / 4, 1744 pDst + DestRowWidthBytes + DestRowWidthBytes / 4 1745 }; 1746 1747 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1748 { 1749 // Raster tile width is same as simd16 tile width 1750 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1751 1752 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1753 1754 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1755 1756 ppDsts[0] += dy; 1757 ppDsts[1] += dy; 1758 ppDsts[2] += dy; 1759 ppDsts[3] += dy; 1760 } 1761 #else 1762 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 1763 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1764 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1765 1766 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 1767 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 1768 1769 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
1770 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 1771 { 1772 uint32_t rowOffset = row * DestRowWidthBytes; 1773 1774 uint8_t* pRow = pCol0 + rowOffset; 1775 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 1776 1777 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1778 pSrc += pSrcInc; 1779 1780 ppDsts[0] += DestRowWidthBytes / 4; 1781 ppDsts[1] += DestRowWidthBytes / 4; 1782 1783 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1784 pSrc += pSrcInc; 1785 } 1786 #endif 1787 } 1788 }; 1789 1790 ////////////////////////////////////////////////////////////////////////// 1791 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp 1792 ////////////////////////////////////////////////////////////////////////// 1793 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1794 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> 1795 { 1796 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; 1797 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1798 1799 ////////////////////////////////////////////////////////////////////////// 1800 /// @brief Stores an 8x8 raster tile to the destination surface. 1801 /// @param pSrc - Pointer to raster tile. 1802 /// @param pDstSurface - Destination surface state 1803 /// @param x, y - Coordinates to raster tile. 
1804 INLINE static void Store( 1805 uint8_t *pSrc, 1806 SWR_SURFACE_STATE* pDstSurface, 1807 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1808 { 1809 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1810 1811 // Punt non-full tiles to generic store 1812 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1813 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1814 1815 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1816 { 1817 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1818 } 1819 1820 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1821 // We can compute the offsets to each column within the raster tile once and increment from these. 1822 #if USE_8x2_TILE_BACKEND 1823 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1824 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1825 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1826 1827 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1828 1829 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
1830 uint8_t *ppDsts[] = 1831 { 1832 pDst, 1833 pDst + DestRowWidthBytes, 1834 pDst + DestRowWidthBytes / 2, 1835 pDst + DestRowWidthBytes + DestRowWidthBytes / 2 1836 }; 1837 1838 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1839 { 1840 // Raster tile width is same as simd16 tile width 1841 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1842 1843 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1844 1845 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1846 1847 ppDsts[0] += dy; 1848 ppDsts[1] += dy; 1849 ppDsts[2] += dy; 1850 ppDsts[3] += dy; 1851 } 1852 #else 1853 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 1854 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1855 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1856 1857 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 1858 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 1859 1860 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
1861 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 1862 { 1863 uint32_t rowOffset = row * DestRowWidthBytes; 1864 1865 uint8_t* pRow = pCol0 + rowOffset; 1866 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 1867 1868 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1869 pSrc += pSrcInc; 1870 1871 ppDsts[0] += DestRowWidthBytes / 2; 1872 ppDsts[1] += DestRowWidthBytes / 2; 1873 1874 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1875 pSrc += pSrcInc; 1876 } 1877 #endif 1878 } 1879 }; 1880 1881 ////////////////////////////////////////////////////////////////////////// 1882 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp 1883 ////////////////////////////////////////////////////////////////////////// 1884 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1885 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> 1886 { 1887 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; 1888 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1889 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1890 1891 ////////////////////////////////////////////////////////////////////////// 1892 /// @brief Stores an 8x8 raster tile to the destination surface. 1893 /// @param pSrc - Pointer to raster tile. 1894 /// @param pDstSurface - Destination surface state 1895 /// @param x, y - Coordinates to raster tile. 
1896 INLINE static void Store( 1897 uint8_t *pSrc, 1898 SWR_SURFACE_STATE* pDstSurface, 1899 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1900 { 1901 static const uint32_t DestRowWidthBytes = 512; // 512B rows 1902 1903 // Punt non-full tiles to generic store 1904 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1905 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1906 1907 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1908 { 1909 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1910 } 1911 1912 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. 1913 // We can compute the offsets to each column within the raster tile once and increment from these. 1914 #if USE_8x2_TILE_BACKEND 1915 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1916 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1917 1918 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1919 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1920 1921 uint8_t* ppDsts[] = 1922 { 1923 pDst, // row 0, col 0 1924 pDst + DestRowWidthBytes, // row 1, col 0 1925 pDst + dx / 2, // row 0, col 1 1926 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1 1927 }; 1928 1929 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1930 { 1931 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1932 { 1933 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1934 1935 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1936 1937 ppDsts[0] += dx; 1938 ppDsts[1] += dx; 1939 ppDsts[2] += dx; 1940 ppDsts[3] += dx; 1941 } 1942 1943 ppDsts[0] += dy; 1944 ppDsts[1] += dy; 1945 ppDsts[2] += dy; 1946 ppDsts[3] += dy; 1947 } 1948 #else 1949 uint8_t 
*pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1950 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1951 uint8_t* pRow1 = pRow0 + DestRowWidthBytes; 1952 1953 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 1954 { 1955 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) 1956 { 1957 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8); 1958 1959 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; 1960 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1961 1962 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 1963 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 1964 } 1965 1966 pRow0 += (DestRowWidthBytes * 2); 1967 pRow1 += (DestRowWidthBytes * 2); 1968 } 1969 #endif 1970 } 1971 }; 1972 1973 ////////////////////////////////////////////////////////////////////////// 1974 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp 1975 ////////////////////////////////////////////////////////////////////////// 1976 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1977 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> 1978 { 1979 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; 1980 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1981 1982 ////////////////////////////////////////////////////////////////////////// 1983 /// @brief Stores an 8x8 raster tile to the destination surface. 1984 /// @param pSrc - Pointer to raster tile. 1985 /// @param pDstSurface - Destination surface state 1986 /// @param x, y - Coordinates to raster tile. 
1987 INLINE static void Store( 1988 uint8_t *pSrc, 1989 SWR_SURFACE_STATE* pDstSurface, 1990 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1991 { 1992 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1993 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 1994 1995 // Punt non-full tiles to generic store 1996 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1997 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1998 1999 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 2000 { 2001 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 2002 } 2003 2004 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 2005 // We can compute the offsets to each column within the raster tile once and increment from these. 2006 #if USE_8x2_TILE_BACKEND 2007 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 2008 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2009 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2010 2011 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 2012 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 2013 2014 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
2015 uint8_t *ppDsts[] = 2016 { 2017 pDst, // row 0, col 0 2018 pDst + DestRowWidthBytes, // row 1, col 0 2019 pDst + DestColumnBytes, // row 0, col 1 2020 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1 2021 }; 2022 2023 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 2024 { 2025 // Raster tile width is same as simd16 tile width 2026 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 2027 2028 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2029 2030 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 2031 2032 ppDsts[0] += dy; 2033 ppDsts[1] += dy; 2034 ppDsts[2] += dy; 2035 ppDsts[3] += dy; 2036 } 2037 #else 2038 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 2039 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2040 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2041 2042 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 2043 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 2044 2045 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
2046 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 2047 { 2048 uint32_t rowOffset = row * DestRowWidthBytes; 2049 2050 uint8_t* pRow = pCol0 + rowOffset; 2051 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 2052 2053 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2054 pSrc += pSrcInc; 2055 2056 ppDsts[0] += DestColumnBytes; 2057 ppDsts[1] += DestColumnBytes; 2058 2059 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2060 pSrc += pSrcInc; 2061 } 2062 #endif 2063 } 2064 }; 2065 2066 ////////////////////////////////////////////////////////////////////////// 2067 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp 2068 ////////////////////////////////////////////////////////////////////////// 2069 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 2070 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> 2071 { 2072 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; 2073 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 2074 2075 ////////////////////////////////////////////////////////////////////////// 2076 /// @brief Stores an 8x8 raster tile to the destination surface. 2077 /// @param pSrc - Pointer to raster tile. 2078 /// @param pDstSurface - Destination surface state 2079 /// @param x, y - Coordinates to raster tile. 2080 INLINE static void Store( 2081 uint8_t *pSrc, 2082 SWR_SURFACE_STATE* pDstSurface, 2083 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 2084 { 2085 static const uint32_t DestRowWidthBytes = 16; // 16B rows 2086 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 
2087 2088 // Punt non-full tiles to generic store 2089 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 2090 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 2091 2092 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 2093 { 2094 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 2095 } 2096 2097 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 2098 // We can compute the offsets to each column within the raster tile once and increment from these. 2099 #if USE_8x2_TILE_BACKEND 2100 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 2101 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2102 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2103 2104 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 2105 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 2106 2107 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
2108 uint8_t *ppDsts[] = 2109 { 2110 pDst, // row 0, col 0 2111 pDst + DestRowWidthBytes, // row 1, col 0 2112 pDst + DestColumnBytes, // row 0, col 1 2113 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 2114 pDst + DestColumnBytes * 2, // row 0, col 2 2115 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 2116 pDst + DestColumnBytes * 3, // row 0, col 3 2117 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3 2118 }; 2119 2120 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 2121 { 2122 // Raster tile width is same as simd16 tile width 2123 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 2124 2125 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2126 2127 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 2128 2129 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) 2130 { 2131 ppDsts[i] += dy; 2132 } 2133 } 2134 #else 2135 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 2136 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2137 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2138 uint8_t* pCol1 = pCol0 + DestColumnBytes; 2139 2140 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. 2141 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 2142 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 2143 2144 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
2145 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 2146 { 2147 uint32_t rowOffset = row * DestRowWidthBytes; 2148 uint8_t* ppDsts[] = 2149 { 2150 pCol0 + rowOffset, 2151 pCol0 + rowOffset + DestRowWidthBytes, 2152 pCol1 + rowOffset, 2153 pCol1 + rowOffset + DestRowWidthBytes, 2154 }; 2155 2156 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2157 pSrc += pSrcInc; 2158 2159 ppDsts[0] += DestColumnBytes * 2; 2160 ppDsts[1] += DestColumnBytes * 2; 2161 ppDsts[2] += DestColumnBytes * 2; 2162 ppDsts[3] += DestColumnBytes * 2; 2163 2164 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2165 pSrc += pSrcInc; 2166 } 2167 #endif 2168 } 2169 }; 2170 2171 ////////////////////////////////////////////////////////////////////////// 2172 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp 2173 ////////////////////////////////////////////////////////////////////////// 2174 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 2175 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> 2176 { 2177 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile; 2178 #if USE_8x2_TILE_BACKEND 2179 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 2180 2181 #else 2182 static const size_t TILE_Y_COL_WIDTH_BYTES = 16; 2183 static const size_t TILE_Y_ROWS = 32; 2184 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; 2185 2186 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 2187 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 2188 static const size_t MAX_DST_COLUMN_BYTES = 16; 2189 2190 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 2191 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; 2192 2193 #endif 2194 ////////////////////////////////////////////////////////////////////////// 
2195 /// @brief Stores an 8x8 raster tile to the destination surface. 2196 /// @param pSrc - Pointer to raster tile. 2197 /// @param pDstSurface - Destination surface state 2198 /// @param x, y - Coordinates to raster tile. 2199 INLINE static void Store( 2200 uint8_t *pSrc, 2201 SWR_SURFACE_STATE* pDstSurface, 2202 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 2203 { 2204 #if USE_8x2_TILE_BACKEND 2205 static const uint32_t DestRowWidthBytes = 16; // 16B rows 2206 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 2207 #endif 2208 2209 // Punt non-full tiles to generic store 2210 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 2211 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 2212 2213 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 2214 { 2215 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 2216 } 2217 2218 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 2219 // We can compute the offsets to each column within the raster tile once and increment from these. 2220 #if USE_8x2_TILE_BACKEND 2221 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 2222 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2223 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2224 2225 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 2226 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 2227 2228 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
2229 uint8_t *ppDsts[] = 2230 { 2231 pDst, // row 0, col 0 2232 pDst + DestRowWidthBytes, // row 1, col 0 2233 pDst + DestColumnBytes, // row 0, col 1 2234 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 2235 pDst + DestColumnBytes * 2, // row 0, col 2 2236 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 2237 pDst + DestColumnBytes * 3, // row 0, col 3 2238 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3 2239 pDst + DestColumnBytes * 4, // row 0, col 4 2240 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4 2241 pDst + DestColumnBytes * 5, // row 0, col 5 2242 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5 2243 pDst + DestColumnBytes * 6, // row 0, col 6 2244 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6 2245 pDst + DestColumnBytes * 7, // row 0, col 7 2246 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7 2247 }; 2248 2249 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 2250 { 2251 // Raster tile width is same as simd16 tile width 2252 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 2253 2254 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2255 2256 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 2257 2258 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) 2259 { 2260 ppDsts[i] += dy; 2261 } 2262 } 2263 #else 2264 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 
2265 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2266 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2267 struct DstPtrs 2268 { 2269 uint8_t* ppDsts[8]; 2270 } ptrs; 2271 2272 // Need 8 pointers, 4 columns of 2 rows each 2273 for (uint32_t y = 0; y < 2; ++y) 2274 { 2275 for (uint32_t x = 0; x < 4; ++x) 2276 { 2277 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; 2278 } 2279 } 2280 2281 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 2282 { 2283 DstPtrs startPtrs = ptrs; 2284 2285 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 2286 { 2287 // Format conversion and convert from SOA to AOS, and store the rows. 2288 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); 2289 2290 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 2291 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 2292 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 2293 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 2294 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; 2295 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; 2296 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; 2297 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; 2298 pSrc += SRC_COLUMN_BYTES; 2299 } 2300 2301 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; 2302 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; 2303 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; 2304 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; 2305 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; 2306 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; 2307 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; 2308 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; 2309 } 2310 #endif 2311 } 2312 }; 2313 2314 
////////////////////////////////////////////////////////////////////////// 2315 /// StoreMacroTile - Stores a macro tile which consists of raster tiles. 2316 ////////////////////////////////////////////////////////////////////////// 2317 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 2318 struct StoreMacroTile 2319 { 2320 ////////////////////////////////////////////////////////////////////////// 2321 /// @brief Stores a macrotile to the destination surface using safe implementation. 2322 /// @param pSrc - Pointer to macro tile. 2323 /// @param pDstSurface - Destination surface state 2324 /// @param x, y - Coordinates to macro tile 2325 static void StoreGeneric( 2326 uint8_t *pSrcHotTile, 2327 SWR_SURFACE_STATE* pDstSurface, 2328 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) 2329 { 2330 PFN_STORE_TILES_INTERNAL pfnStore; 2331 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; 2332 2333 // Store each raster tile from the hot tile to the destination surface. 2334 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 2335 { 2336 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 2337 { 2338 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 2339 { 2340 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); 2341 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 2342 } 2343 } 2344 } 2345 2346 } 2347 2348 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); 2349 ////////////////////////////////////////////////////////////////////////// 2350 /// @brief Stores a macrotile to the destination surface. 2351 /// @param pSrc - Pointer to macro tile. 
2352 /// @param pDstSurface - Destination surface state 2353 /// @param x, y - Coordinates to macro tile 2354 static void Store( 2355 uint8_t *pSrcHotTile, 2356 SWR_SURFACE_STATE* pDstSurface, 2357 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) 2358 { 2359 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; 2360 2361 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 2362 { 2363 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>( 2364 0, 2365 0, 2366 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces 2367 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays 2368 sampleNum, 2369 pDstSurface->lod, 2370 pDstSurface); 2371 2372 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear 2373 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || 2374 (pDstSurface->bInterleavedSamples); 2375 2376 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; 2377 } 2378 2379 // Save original for pSrcHotTile resolve. 2380 uint8_t *pResolveSrcHotTile = pSrcHotTile; 2381 2382 // Store each raster tile from the hot tile to the destination surface. 
2383 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 2384 { 2385 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 2386 { 2387 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 2388 { 2389 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); 2390 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 2391 } 2392 } 2393 } 2394 2395 if (pDstSurface->xpAuxBaseAddress) 2396 { 2397 uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 2398 // Store each raster tile from the hot tile to the destination surface. 2399 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 2400 { 2401 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 2402 { 2403 StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex); 2404 pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples; 2405 } 2406 } 2407 } 2408 } 2409 }; 2410 2411 ////////////////////////////////////////////////////////////////////////// 2412 /// InitStoreTilesTable - Helper for setting up the tables. 
2413 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> 2414 void InitStoreTilesTableColor_Half1( 2415 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) 2416 { 2417 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; 2418 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; 2419 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; 2420 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; 2421 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store; 2422 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store; 2423 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; 2424 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; 2425 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; 2426 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store; 2427 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store; 2428 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; 2429 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; 2430 table[TTileMode][R16G16B16A16_SINT] = 
StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; 2431 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; 2432 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; 2433 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; 2434 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store; 2435 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store; 2436 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; 2437 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store; 2438 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; 2439 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; 2440 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store; 2441 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store; 2442 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store; 2443 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store; 2444 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; 2445 
table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; 2446 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; 2447 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; 2448 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; 2449 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; 2450 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; 2451 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; 2452 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; 2453 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; 2454 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; 2455 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; 2456 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store; 2457 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store; 2458 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; 2459 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; 2460 
table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; 2461 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; 2462 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric; 2463 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store; 2464 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store; 2465 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store; 2466 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric; 2467 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric; 2468 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store; 2469 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; 2470 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; 2471 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; 2472 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; 2473 } 2474 2475 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> 2476 void InitStoreTilesTableColor_Half2( 2477 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT]) 2478 { 2479 table[TTileMode][R9G9B9E5_SHAREDEXP] = 
StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric; 2480 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; 2481 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric; 2482 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store; 2483 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store; 2484 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store; 2485 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store; 2486 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store; 2487 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store; 2488 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; 2489 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; 2490 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; 2491 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; 2492 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric; 2493 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; 2494 
table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; 2495 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; 2496 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store; 2497 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store; 2498 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store; 2499 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store; 2500 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store; 2501 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store; 2502 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store; 2503 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store; 2504 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store; 2505 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; 2506 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; 2507 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store; 2508 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store; 2509 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store; 2510 table[TTileMode][R16_USCALED] = 
StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store; 2511 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric; 2512 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric; 2513 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store; 2514 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store; 2515 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store; 2516 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store; 2517 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store; 2518 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store; 2519 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store; 2520 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; 2521 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; 2522 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store; 2523 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store; 2524 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; 2525 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; 2526 table[TTileMode][R16G16B16_SNORM] = 
StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; 2527 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store; 2528 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store; 2529 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; 2530 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; 2531 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; 2532 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; 2533 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric; 2534 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric; 2535 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; 2536 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; 2537 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric; 2538 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric; 2539 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; 2540 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, 
B10G10R10A2_SINT>::StoreGeneric; 2541 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; 2542 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; 2543 } 2544 2545 ////////////////////////////////////////////////////////////////////////// 2546 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. 2547 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> 2548 void InitStoreTilesTableDepth( 2549 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) 2550 { 2551 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store; 2552 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; 2553 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; 2554 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store; 2555 } 2556 2557 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> 2558 void InitStoreTilesTableStencil( 2559 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) 2560 { 2561 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store; 2562 } 2563