1 /**************************************************************************** 2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file StoreTile.h 24 * 25 * @brief Functionality for Store. 26 * 27 ******************************************************************************/ 28 #pragma once 29 30 #include "common/os.h" 31 #include "common/formats.h" 32 #include "core/context.h" 33 #include "core/rdtsc_core.h" 34 #include "core/format_conversion.h" 35 36 #include "memory/TilingFunctions.h" 37 #include "memory/Convert.h" 38 #include "core/multisample.h" 39 40 #include <array> 41 #include <sstream> 42 43 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats. 44 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); 45 46 ////////////////////////////////////////////////////////////////////////// 47 /// Store Raster Tile Function Tables. 48 ////////////////////////////////////////////////////////////////////////// 49 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 50 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 51 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 52 53 void InitStoreTilesTable_Linear_1(); 54 void InitStoreTilesTable_Linear_2(); 55 void InitStoreTilesTable_TileX_1(); 56 void InitStoreTilesTable_TileX_2(); 57 void InitStoreTilesTable_TileY_1(); 58 void InitStoreTilesTable_TileY_2(); 59 void InitStoreTilesTable_TileW(); 60 void InitStoreTilesTable(); 61 62 ////////////////////////////////////////////////////////////////////////// 63 /// StorePixels 64 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 65 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 66 /// @param ppDsts - Array of destination pointers. Each pointer is 67 /// to a single row of at most 16B. 68 /// @tparam NumDests - Number of destination pointers. Each pair of 69 /// pointers is for a 16-byte column of two rows. 70 ////////////////////////////////////////////////////////////////////////// 71 template <size_t PixelSize, size_t NumDests> 72 struct StorePixels 73 { 74 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; 75 }; 76 77 ////////////////////////////////////////////////////////////////////////// 78 /// StorePixels (32-bit pixel specialization) 79 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 80 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 81 /// @param ppDsts - Array of destination pointers. Each pointer is 82 /// to a single row of at most 16B. 83 /// @tparam NumDests - Number of destination pointers. Each pair of 84 /// pointers is for a 16-byte column of two rows. 85 ////////////////////////////////////////////////////////////////////////// 86 template <> 87 struct StorePixels<8, 2> 88 { 89 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 90 { 91 // Each 4-pixel row is 4 bytes. 92 const uint16_t* pPixSrc = (const uint16_t*)pSrc; 93 94 // Unswizzle from SWR-Z order 95 uint16_t* pRow = (uint16_t*)ppDsts[0]; 96 pRow[0] = pPixSrc[0]; 97 pRow[1] = pPixSrc[2]; 98 99 pRow = (uint16_t*)ppDsts[1]; 100 pRow[0] = pPixSrc[1]; 101 pRow[1] = pPixSrc[3]; 102 } 103 }; 104 105 #if USE_8x2_TILE_BACKEND 106 template <> 107 struct StorePixels<8, 4> 108 { 109 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 110 { 111 // 8 x 2 bytes = 16 bytes, 16 pixels 112 const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc); 113 114 uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts); 115 116 // Unswizzle from SWR-Z order 117 ppDsts16[0][0] = pSrc16[0]; // 0 1 118 ppDsts16[0][1] = pSrc16[2]; // 4 5 119 120 ppDsts16[1][0] = pSrc16[1]; // 2 3 121 ppDsts16[1][1] = pSrc16[3]; // 6 7 122 123 ppDsts16[2][0] = pSrc16[4]; // 8 9 124 ppDsts16[2][1] = pSrc16[6]; // C D 125 126 ppDsts16[3][0] = pSrc16[5]; // A B 127 ppDsts16[3][1] = pSrc16[7]; // E F 128 } 129 }; 130 131 #endif 132 ////////////////////////////////////////////////////////////////////////// 133 /// StorePixels (32-bit pixel specialization) 134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 135 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 136 /// @param ppDsts - Array of destination pointers. Each pointer is 137 /// to a single row of at most 16B. 138 /// @tparam NumDests - Number of destination pointers. Each pair of 139 /// pointers is for a 16-byte column of two rows. 140 ////////////////////////////////////////////////////////////////////////// 141 template <> 142 struct StorePixels<16, 2> 143 { 144 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 145 { 146 // Each 4-pixel row is 8 bytes. 147 const uint32_t* pPixSrc = (const uint32_t*)pSrc; 148 149 // Unswizzle from SWR-Z order 150 uint32_t* pRow = (uint32_t*)ppDsts[0]; 151 pRow[0] = pPixSrc[0]; 152 pRow[1] = pPixSrc[2]; 153 154 pRow = (uint32_t*)ppDsts[1]; 155 pRow[0] = pPixSrc[1]; 156 pRow[1] = pPixSrc[3]; 157 } 158 }; 159 160 #if USE_8x2_TILE_BACKEND 161 template <> 162 struct StorePixels<16, 4> 163 { 164 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 165 { 166 // 8 x 4 bytes = 32 bytes, 16 pixels 167 const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc); 168 169 uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts); 170 171 // Unswizzle from SWR-Z order 172 ppDsts32[0][0] = pSrc32[0]; // 0 1 173 ppDsts32[0][1] = pSrc32[2]; // 4 5 174 175 ppDsts32[1][0] = pSrc32[1]; // 2 3 176 ppDsts32[1][1] = pSrc32[3]; // 6 7 177 178 ppDsts32[2][0] = pSrc32[4]; // 8 9 179 ppDsts32[2][1] = pSrc32[6]; // C D 180 181 ppDsts32[3][0] = pSrc32[5]; // A B 182 ppDsts32[3][1] = pSrc32[7]; // E F 183 } 184 }; 185 186 #endif 187 ////////////////////////////////////////////////////////////////////////// 188 /// StorePixels (32-bit pixel specialization) 189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 190 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 191 /// @param ppDsts - Array of destination pointers. Each pointer is 192 /// to a single row of at most 16B. 193 /// @tparam NumDests - Number of destination pointers. Each pair of 194 /// pointers is for a 16-byte column of two rows. 195 ////////////////////////////////////////////////////////////////////////// 196 template <> 197 struct StorePixels<32, 2> 198 { 199 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 200 { 201 // Each 4-pixel row is 16-bytes 202 __m128i *pZRow01 = (__m128i*)pSrc; 203 __m128i vQuad00 = _mm_load_si128(pZRow01); 204 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); 205 206 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); 207 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); 208 209 _mm_storeu_si128((__m128i*)ppDsts[0], vRow00); 210 _mm_storeu_si128((__m128i*)ppDsts[1], vRow10); 211 } 212 }; 213 214 #if USE_8x2_TILE_BACKEND 215 template <> 216 struct StorePixels<32, 4> 217 { 218 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 219 { 220 // 4 x 16 bytes = 64 bytes, 16 pixels 221 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); 222 223 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); 224 225 // Unswizzle from SWR-Z order 226 __m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3 227 __m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7 228 __m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B 229 __m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F 230 231 _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5 232 _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7 233 _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D 234 _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F 235 } 236 }; 237 238 #endif 239 ////////////////////////////////////////////////////////////////////////// 240 /// StorePixels (32-bit pixel specialization) 241 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 242 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 243 /// @param ppDsts - Array of destination pointers. Each pointer is 244 /// to a single row of at most 16B. 245 /// @tparam NumDests - Number of destination pointers. Each pair of 246 /// pointers is for a 16-byte column of two rows. 247 ////////////////////////////////////////////////////////////////////////// 248 template <> 249 struct StorePixels<64, 4> 250 { 251 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 252 { 253 // Each 4-pixel row is 32 bytes. 254 const __m128i* pPixSrc = (const __m128i*)pSrc; 255 256 // order of pointers match SWR-Z layout 257 __m128i** pvDsts = (__m128i**)&ppDsts[0]; 258 *pvDsts[0] = pPixSrc[0]; 259 *pvDsts[1] = pPixSrc[1]; 260 *pvDsts[2] = pPixSrc[2]; 261 *pvDsts[3] = pPixSrc[3]; 262 } 263 }; 264 265 #if USE_8x2_TILE_BACKEND 266 template <> 267 struct StorePixels<64, 8> 268 { 269 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) 270 { 271 // 8 x 16 bytes = 128 bytes, 16 pixels 272 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); 273 274 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); 275 276 // order of pointers match SWR-Z layout 277 *ppDsts128[0] = pSrc128[0]; // 0 1 278 *ppDsts128[1] = pSrc128[1]; // 2 3 279 *ppDsts128[2] = pSrc128[2]; // 4 5 280 *ppDsts128[3] = pSrc128[3]; // 6 7 281 *ppDsts128[4] = pSrc128[4]; // 8 9 282 *ppDsts128[5] = pSrc128[5]; // A B 283 *ppDsts128[6] = pSrc128[6]; // C D 284 *ppDsts128[7] = pSrc128[7]; // E F 285 } 286 }; 287 288 #endif 289 ////////////////////////////////////////////////////////////////////////// 290 /// StorePixels (32-bit pixel specialization) 291 /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 292 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order 293 /// @param ppDsts - Array of destination pointers. Each pointer is 294 /// to a single row of at most 16B. 295 /// @tparam NumDests - Number of destination pointers. Each pair of 296 /// pointers is for a 16-byte column of two rows. 297 ////////////////////////////////////////////////////////////////////////// 298 template <> 299 struct StorePixels<128, 8> 300 { 301 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) 302 { 303 // Each 4-pixel row is 64 bytes. 304 const __m128i* pPixSrc = (const __m128i*)pSrc; 305 306 // Unswizzle from SWR-Z order 307 __m128i** pvDsts = (__m128i**)&ppDsts[0]; 308 *pvDsts[0] = pPixSrc[0]; 309 *pvDsts[1] = pPixSrc[2]; 310 *pvDsts[2] = pPixSrc[1]; 311 *pvDsts[3] = pPixSrc[3]; 312 *pvDsts[4] = pPixSrc[4]; 313 *pvDsts[5] = pPixSrc[6]; 314 *pvDsts[6] = pPixSrc[5]; 315 *pvDsts[7] = pPixSrc[7]; 316 } 317 }; 318 319 #if USE_8x2_TILE_BACKEND 320 template <> 321 struct StorePixels<128, 16> 322 { 323 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16]) 324 { 325 // 16 x 16 bytes = 256 bytes, 16 pixels 326 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); 327 328 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); 329 330 for (uint32_t i = 0; i < 16; i += 4) 331 { 332 *ppDsts128[i + 0] = pSrc128[i + 0]; 333 *ppDsts128[i + 1] = pSrc128[i + 2]; 334 *ppDsts128[i + 2] = pSrc128[i + 1]; 335 *ppDsts128[i + 3] = pSrc128[i + 3]; 336 } 337 } 338 }; 339 340 #endif 341 ////////////////////////////////////////////////////////////////////////// 342 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 343 ////////////////////////////////////////////////////////////////////////// 344 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 345 struct ConvertPixelsSOAtoAOS 346 { 347 ////////////////////////////////////////////////////////////////////////// 348 /// @brief Converts a SIMD from the Hot Tile to the destination format 349 /// and converts from SOA to AOS. 350 /// @param pSrc - Pointer to raster tile. 351 /// @param pDst - Pointer to destination surface or deswizzling buffer. 352 template <size_t NumDests> 353 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 354 { 355 #if USE_8x2_TILE_BACKEND 356 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 357 358 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 359 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 360 361 // Convert from SrcFormat --> DstFormat 362 simd16vector src; 363 LoadSOA<SrcFormat>(pSrc, src); 364 StoreSOA<DstFormat>(src, soaTile); 365 366 // Convert from SOA --> AOS 367 FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile); 368 369 #else 370 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 371 372 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 373 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 374 375 // Convert from SrcFormat --> DstFormat 376 simdvector src; 377 LoadSOA<SrcFormat>(pSrc, src); 378 StoreSOA<DstFormat>(src, soaTile); 379 380 // Convert from SOA --> AOS 381 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile); 382 383 #endif 384 // Store data into destination 385 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); 386 } 387 }; 388 389 ////////////////////////////////////////////////////////////////////////// 390 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 391 /// Specialization for no format conversion 392 ////////////////////////////////////////////////////////////////////////// 393 template<SWR_FORMAT Format> 394 struct ConvertPixelsSOAtoAOS<Format, Format> 395 { 396 ////////////////////////////////////////////////////////////////////////// 397 /// @brief Converts a SIMD from the Hot Tile to the destination format 398 /// and converts from SOA to AOS. 399 /// @param pSrc - Pointer to raster tile. 400 /// @param pDst - Pointer to destination surface or deswizzling buffer. 401 template <size_t NumDests> 402 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 403 { 404 #if USE_8x2_TILE_BACKEND 405 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 406 407 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 408 409 // Convert from SOA --> AOS 410 FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile); 411 412 #else 413 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 414 415 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 416 417 // Convert from SOA --> AOS 418 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile); 419 420 #endif 421 // Store data into destination 422 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts); 423 } 424 }; 425 426 ////////////////////////////////////////////////////////////////////////// 427 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM 428 ////////////////////////////////////////////////////////////////////////// 429 template<> 430 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > 431 { 432 ////////////////////////////////////////////////////////////////////////// 433 /// @brief Converts a SIMD from the Hot Tile to the destination format 434 /// and converts from SOA to AOS. 435 /// @param pSrc - Pointer to raster tile. 436 /// @param pDst - Pointer to destination surface or deswizzling buffer. 437 template <size_t NumDests> 438 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 439 { 440 #if USE_8x2_TILE_BACKEND 441 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; 442 static const SWR_FORMAT DstFormat = B5G6R5_UNORM; 443 444 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 445 446 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 447 448 // Load hot-tile 449 simd16vector src, dst; 450 LoadSOA<SrcFormat>(pSrc, src); 451 452 // deswizzle 453 dst.x = src[FormatTraits<DstFormat>::swizzle(0)]; 454 dst.y = src[FormatTraits<DstFormat>::swizzle(1)]; 455 dst.z = src[FormatTraits<DstFormat>::swizzle(2)]; 456 457 // clamp 458 dst.x = Clamp<DstFormat>(dst.x, 0); 459 dst.y = Clamp<DstFormat>(dst.y, 1); 460 dst.z = Clamp<DstFormat>(dst.z, 2); 461 462 // normalize 463 dst.x = Normalize<DstFormat>(dst.x, 0); 464 dst.y = Normalize<DstFormat>(dst.y, 1); 465 dst.z = Normalize<DstFormat>(dst.z, 2); 466 467 // pack 468 simd16scalari packed = _simd16_castps_si(dst.x); 469 470 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5); 471 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6); 472 473 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5)); 474 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6)); 475 476 // pack low 16 bits of each 32 bit lane to low 128 bits of dst 477 uint32_t *pPacked = (uint32_t*)&packed; 478 uint16_t *pAosTile = (uint16_t*)&aosTile[0]; 479 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t) 480 { 481 *pAosTile++ = *pPacked++; 482 } 483 484 #else 485 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; 486 static const SWR_FORMAT DstFormat = B5G6R5_UNORM; 487 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 488 489 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 490 491 // Load hot-tile 492 simdvector src, dst; 493 LoadSOA<SrcFormat>(pSrc, src); 494 495 // deswizzle 496 dst.x = src[FormatTraits<DstFormat>::swizzle(0)]; 497 dst.y = src[FormatTraits<DstFormat>::swizzle(1)]; 498 dst.z = src[FormatTraits<DstFormat>::swizzle(2)]; 499 500 // clamp 501 dst.x = Clamp<DstFormat>(dst.x, 0); 502 dst.y = Clamp<DstFormat>(dst.y, 1); 503 dst.z = Clamp<DstFormat>(dst.z, 2); 504 505 // normalize 506 dst.x = Normalize<DstFormat>(dst.x, 0); 507 dst.y = Normalize<DstFormat>(dst.y, 1); 508 dst.z = Normalize<DstFormat>(dst.z, 2); 509 510 // pack 511 simdscalari packed = _simd_castps_si(dst.x); 512 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0))); 513 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) + 514 FormatTraits<DstFormat>::GetBPC(1))); 515 516 // pack low 16 bits of each 32 bit lane to low 128 bits of dst 517 uint32_t *pPacked = (uint32_t*)&packed; 518 uint16_t *pAosTile = (uint16_t*)&aosTile[0]; 519 for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t) 520 { 521 *pAosTile++ = *pPacked++; 522 } 523 524 #endif 525 // Store data into destination 526 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); 527 } 528 }; 529 530 ////////////////////////////////////////////////////////////////////////// 531 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 532 ////////////////////////////////////////////////////////////////////////// 533 template<> 534 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS> 535 { 536 static const SWR_FORMAT SrcFormat = R32_FLOAT; 537 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS; 538 539 ////////////////////////////////////////////////////////////////////////// 540 /// @brief Converts a SIMD from the Hot Tile to the destination format 541 /// and converts from SOA to AOS. 542 /// @param pSrc - Pointer to raster tile. 543 /// @param pDst - Pointer to destination surface or deswizzling buffer. 544 template <size_t NumDests> 545 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 546 { 547 #if USE_8x2_TILE_BACKEND 548 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 549 550 // clamp 551 const simd16scalar zero = _simd16_setzero_ps(); 552 const simd16scalar ones = _simd16_set1_ps(1.0f); 553 554 comp = _simd16_max_ps(comp, zero); 555 comp = _simd16_min_ps(comp, ones); 556 557 // normalize 558 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 559 560 simd16scalari temp = _simd16_cvtps_epi32(comp); 561 562 // swizzle 563 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 564 565 // merge/store data into destination but don't overwrite the X8 bits 566 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0])); 567 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2])); 568 569 simd16scalari dest = _simd16_setzero_si(); 570 571 dest = _simd16_insert_si(dest, destlo, 0); 572 dest = _simd16_insert_si(dest, desthi, 1); 573 574 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF); 575 576 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp)); 577 578 _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0)); 579 _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1)); 580 #else 581 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 582 583 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 584 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 585 586 // Convert from SrcFormat --> DstFormat 587 simdvector src; 588 LoadSOA<SrcFormat>(pSrc, src); 589 StoreSOA<DstFormat>(src, soaTile); 590 591 // Convert from SOA --> AOS 592 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile); 593 594 // Store data into destination but don't overwrite the X8 bits 595 // Each 4-pixel row is 16-bytes 596 __m128i *pZRow01 = (__m128i*)aosTile; 597 __m128i vQuad00 = _mm_load_si128(pZRow01); 598 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); 599 600 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); 601 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); 602 603 __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]); 604 __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]); 605 606 __m128i vMask = _mm_set1_epi32(0xFFFFFF); 607 608 vDst0 = _mm_andnot_si128(vMask, vDst0); 609 vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask)); 610 vDst1 = _mm_andnot_si128(vMask, vDst1); 611 vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask)); 612 613 _mm_storeu_si128((__m128i*)ppDsts[0], vDst0); 614 _mm_storeu_si128((__m128i*)ppDsts[1], vDst1); 615 #endif 616 } 617 }; 618 619 #if USE_8x2_TILE_BACKEND 620 template<SWR_FORMAT DstFormat> 621 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) 622 { 623 // swizzle rgba -> bgra while we load 624 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr 625 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg 626 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb 627 simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa 628 629 // clamp 630 const simd16scalar zero = _simd16_setzero_ps(); 631 const simd16scalar ones = _simd16_set1_ps(1.0f); 632 633 comp0 = _simd16_max_ps(comp0, zero); 634 comp0 = _simd16_min_ps(comp0, ones); 635 636 comp1 = _simd16_max_ps(comp1, zero); 637 comp1 = _simd16_min_ps(comp1, ones); 638 639 comp2 = _simd16_max_ps(comp2, zero); 640 comp2 = _simd16_min_ps(comp2, ones); 641 642 comp3 = _simd16_max_ps(comp3, zero); 643 comp3 = _simd16_min_ps(comp3, ones); 644 645 // gamma-correct only rgb 646 if (FormatTraits<DstFormat>::isSRGB) 647 { 648 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); 649 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); 650 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); 651 } 652 653 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format 654 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 655 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 656 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 657 comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); 658 659 // moving to 16 wide integer vector types 660 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr 661 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg 662 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb 663 simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa 664 665 // SOA to AOS conversion 666 src1 = _simd16_slli_epi32(src1, 8); 667 src2 = _simd16_slli_epi32(src2, 16); 668 src3 = _simd16_slli_epi32(src3, 24); 669 670 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F 671 672 // de-swizzle conversion 673 #if 1 674 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B 675 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F 676 677 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F 678 679 #else 680 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 681 682 #endif 683 // store 8x2 memory order: 684 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } 685 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } 686 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0)); 687 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1)); 688 } 689 690 #endif 691 template<SWR_FORMAT DstFormat> 692 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) 693 { 694 static const uint32_t offset = sizeof(simdscalar); 695 696 // swizzle rgba -> bgra while we load 697 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr 698 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg 699 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb 700 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa 701 702 // clamp 703 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); 704 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); 705 706 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); 707 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); 708 709 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); 710 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); 711 712 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); 713 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); 714 715 if (FormatTraits<DstFormat>::isSRGB) 716 { 717 // Gamma-correct only rgb 718 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); 719 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); 720 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); 721 } 722 723 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format 724 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 725 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 726 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 727 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); 728 729 // moving to 8 wide integer vector types 730 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr 731 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg 732 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb 733 __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa 734 735 #if KNOB_ARCH == KNOB_ARCH_AVX 736 737 // splitting into two sets of 4 wide integer vector types 738 // because AVX doesn't have instructions to support this operation at 8 wide 739 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r 740 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g 741 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b 742 __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a 743 744 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r 745 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g 746 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b 747 __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a 748 749 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 750 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 751 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 752 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 753 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 754 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 755 756 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr 757 srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00 758 759 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr 760 srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00 761 762 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr 763 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr 764 765 // unpack into rows that get the tiling order correct 766 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr 767 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); 768 769 __m256i final = _mm256_castsi128_si256(vRow00); 770 final = _mm256_insertf128_si256(final, vRow10, 1); 771 772 #elif KNOB_ARCH >= KNOB_ARCH_AVX2 773 774 // logic is as above, only wider 775 src1 = _mm256_slli_si256(src1, 1); 776 src2 = _mm256_slli_si256(src2, 2); 777 src3 = _mm256_slli_si256(src3, 3); 778 779 src0 = _mm256_or_si256(src0, src1); 780 src2 = _mm256_or_si256(src2, src3); 781 782 __m256i final = _mm256_or_si256(src0, src2); 783 #if 0 784 785 __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); 786 787 final = _mm256_permutevar8x32_epi32(final, perm); 788 #else 789 790 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 791 final = _mm256_permute4x64_epi64(final, 0xD8); 792 #endif 793 #endif 794 795 _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final); 796 } 797 798 #if USE_8x2_TILE_BACKEND 799 template<SWR_FORMAT DstFormat> 800 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) 801 { 802 // swizzle rgba -> bgra while we load 803 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr 804 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg 805 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb 806 807 // clamp 808 const simd16scalar zero = _simd16_setzero_ps(); 809 const simd16scalar ones = _simd16_set1_ps(1.0f); 810 811 comp0 = _simd16_max_ps(comp0, zero); 812 comp0 = _simd16_min_ps(comp0, ones); 813 814 comp1 = _simd16_max_ps(comp1, zero); 815 comp1 = _simd16_min_ps(comp1, ones); 816 817 comp2 = _simd16_max_ps(comp2, zero); 818 comp2 = _simd16_min_ps(comp2, ones); 819 820 // gamma-correct only rgb 821 if (FormatTraits<DstFormat>::isSRGB) 822 { 823 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); 824 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); 825 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); 826 } 827 828 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format 829 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 830 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 831 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 832 833 // moving to 16 wide integer vector types 834 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr 835 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg 836 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb 837 838 // SOA to AOS conversion 839 src1 = _simd16_slli_epi32(src1, 8); 840 src2 = _simd16_slli_epi32(src2, 16); 841 842 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F 843 844 // de-swizzle conversion 845 #if 1 846 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B 847 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F 848 849 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F 850 851 #else 852 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 853 854 #endif 855 // store 8x2 memory order: 856 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } 857 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } 858 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0)); 859 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1)); 860 } 861 862 #endif 863 template<SWR_FORMAT DstFormat> 864 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) 865 { 866 static const uint32_t offset = sizeof(simdscalar); 867 868 // swizzle rgba -> bgra while we load 869 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr 870 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg 871 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb 872 // clamp 873 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); 874 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); 875 876 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); 877 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); 878 879 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); 880 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); 881 882 if (FormatTraits<DstFormat>::isSRGB) 883 { 884 // Gamma-correct only rgb 885 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); 886 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); 887 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); 888 } 889 890 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format 891 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 892 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 893 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 894 895 // moving to 8 wide integer vector types 896 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr 897 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg 898 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb 899 900 #if KNOB_ARCH == KNOB_ARCH_AVX 901 902 // splitting into two sets of 4 wide integer vector types 903 // because AVX doesn't have instructions to support this operation at 8 wide 904 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r 905 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g 906 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b 907 908 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r 909 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g 910 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b 911 912 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 913 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 914 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 915 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 916 917 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr 918 919 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr 920 921 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr 922 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr 923 924 // unpack into rows that get the tiling order correct 925 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr 926 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); 927 928 __m256i final = _mm256_castsi128_si256(vRow00); 929 final = _mm256_insertf128_si256(final, vRow10, 1); 930 931 #elif KNOB_ARCH >= KNOB_ARCH_AVX2 932 933 // logic is as above, only wider 934 src1 = _mm256_slli_si256(src1, 1); 935 src2 = _mm256_slli_si256(src2, 2); 936 937 src0 = _mm256_or_si256(src0, src1); 938 939 __m256i final = _mm256_or_si256(src0, src2); 940 941 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 942 final = _mm256_permute4x64_epi64(final, 0xD8); 943 944 #endif 945 946 _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final); 947 } 948 949 template<> 950 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> 951 { 952 template <size_t NumDests> 953 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 954 { 955 #if USE_8x2_TILE_BACKEND 956 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 957 #else 958 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 959 #endif 960 } 961 }; 962 963 template<> 964 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> 965 { 966 template <size_t NumDests> 967 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 968 { 969 #if USE_8x2_TILE_BACKEND 970 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 971 #else 972 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 973 #endif 974 } 975 }; 976 977 template<> 978 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > 979 { 980 template <size_t NumDests> 981 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 982 { 983 #if USE_8x2_TILE_BACKEND 984 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 985 #else 986 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 987 #endif 988 } 989 }; 990 991 template<> 992 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > 993 { 994 template <size_t NumDests> 995 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 996 { 997 #if USE_8x2_TILE_BACKEND 998 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 999 #else 1000 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 1001 #endif 1002 } 1003 }; 1004 1005 template<> 1006 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > 1007 { 1008 template <size_t NumDests> 1009 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 1010 { 1011 #if USE_8x2_TILE_BACKEND 1012 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1013 #else 1014 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 1015 #endif 1016 } 1017 }; 1018 1019 template<> 1020 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > 1021 { 1022 template <size_t NumDests> 1023 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 1024 { 1025 #if USE_8x2_TILE_BACKEND 1026 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1027 #else 1028 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 1029 #endif 1030 } 1031 }; 1032 1033 template<> 1034 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > 1035 { 1036 template <size_t NumDests> 1037 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 1038 { 1039 #if USE_8x2_TILE_BACKEND 1040 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1041 #else 1042 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 1043 #endif 1044 } 1045 }; 1046 1047 template<> 1048 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > 1049 { 1050 template <size_t NumDests> 1051 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 1052 { 1053 #if USE_8x2_TILE_BACKEND 1054 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1055 #else 1056 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 1057 #endif 1058 } 1059 }; 1060 1061 ////////////////////////////////////////////////////////////////////////// 1062 /// StoreRasterTile 1063 ////////////////////////////////////////////////////////////////////////// 1064 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1065 struct StoreRasterTile 1066 { 1067 ////////////////////////////////////////////////////////////////////////// 1068 /// @brief Retrieve color from hot tile source which is always float. 1069 /// @param pSrc - Pointer to raster tile. 1070 /// @param x, y - Coordinates to raster tile. 1071 /// @param output - output color 1072 INLINE static void GetSwizzledSrcColor( 1073 uint8_t* pSrc, 1074 uint32_t x, uint32_t y, 1075 float outputColor[4]) 1076 { 1077 #if USE_8x2_TILE_BACKEND 1078 typedef SimdTile_16<SrcFormat, DstFormat> SimdT; 1079 1080 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc); 1081 1082 // Compute which simd tile we're accessing within 8x8 tile. 1083 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. 1084 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM); 1085 1086 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex]; 1087 1088 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); 1089 1090 pSimdTile->GetSwizzledColor(simdOffset, outputColor); 1091 #else 1092 typedef SimdTile<SrcFormat, DstFormat> SimdT; 1093 1094 SimdT* pSrcSimdTiles = (SimdT*)pSrc; 1095 1096 // Compute which simd tile we're accessing within 8x8 tile. 1097 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. 1098 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); 1099 1100 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; 1101 1102 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); 1103 1104 pSimdTile->GetSwizzledColor(simdOffset, outputColor); 1105 #endif 1106 } 1107 1108 ////////////////////////////////////////////////////////////////////////// 1109 /// @brief Stores an 8x8 raster tile to the destination surface. 1110 /// @param pSrc - Pointer to raster tile. 1111 /// @param pDstSurface - Destination surface state 1112 /// @param x, y - Coordinates to raster tile. 1113 INLINE static void Store( 1114 uint8_t *pSrc, 1115 SWR_SURFACE_STATE* pDstSurface, 1116 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. 1117 { 1118 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1119 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1120 1121 // For each raster tile pixel (rx, ry) 1122 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) 1123 { 1124 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) 1125 { 1126 // Perform bounds checking. 1127 if (((x + rx) < lodWidth) && 1128 ((y + ry) < lodHeight)) 1129 { 1130 float srcColor[4]; 1131 GetSwizzledSrcColor(pSrc, rx, ry, srcColor); 1132 1133 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry), 1134 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, 1135 sampleNum, pDstSurface->lod, pDstSurface); 1136 { 1137 ConvertPixelFromFloat<DstFormat>(pDst, srcColor); 1138 } 1139 } 1140 } 1141 } 1142 } 1143 }; 1144 1145 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1146 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat> 1147 {}; 1148 1149 ////////////////////////////////////////////////////////////////////////// 1150 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp 1151 ////////////////////////////////////////////////////////////////////////// 1152 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1153 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> 1154 { 1155 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile; 1156 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1157 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1158 1159 ////////////////////////////////////////////////////////////////////////// 1160 /// @brief Stores an 8x8 raster tile to the destination surface. 1161 /// @param pSrc - Pointer to raster tile. 1162 /// @param pDstSurface - Destination surface state 1163 /// @param x, y - Coordinates to raster tile. 1164 INLINE static void Store( 1165 uint8_t *pSrc, 1166 SWR_SURFACE_STATE* pDstSurface, 1167 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1168 { 1169 // Punt non-full tiles to generic store 1170 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1171 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1172 1173 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1174 { 1175 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1176 } 1177 1178 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1179 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1180 #if USE_8x2_TILE_BACKEND 1181 1182 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1183 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1184 1185 uint8_t* ppDsts[] = 1186 { 1187 pDst, // row 0, col 0 1188 pDst + pDstSurface->pitch, // row 1, col 0 1189 pDst + dx / 2, // row 0, col 1 1190 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1191 }; 1192 1193 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1194 { 1195 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1196 { 1197 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1198 1199 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1200 1201 ppDsts[0] += dx; 1202 ppDsts[1] += dx; 1203 ppDsts[2] += dx; 1204 ppDsts[3] += dx; 1205 } 1206 1207 ppDsts[0] += dy; 1208 ppDsts[1] += dy; 1209 ppDsts[2] += dy; 1210 ppDsts[3] += dy; 1211 } 1212 #else 1213 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 1214 1215 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1216 { 1217 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 1218 1219 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 1220 { 1221 // Format conversion and convert from SOA to AOS, and store the rows. 1222 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 1223 1224 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1225 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1226 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 1227 } 1228 1229 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 1230 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 1231 } 1232 #endif 1233 } 1234 }; 1235 1236 ////////////////////////////////////////////////////////////////////////// 1237 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp 1238 ////////////////////////////////////////////////////////////////////////// 1239 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1240 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> 1241 { 1242 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile; 1243 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1244 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1245 1246 ////////////////////////////////////////////////////////////////////////// 1247 /// @brief Stores an 8x8 raster tile to the destination surface. 1248 /// @param pSrc - Pointer to raster tile. 1249 /// @param pDstSurface - Destination surface state 1250 /// @param x, y - Coordinates to raster tile. 1251 INLINE static void Store( 1252 uint8_t *pSrc, 1253 SWR_SURFACE_STATE* pDstSurface, 1254 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1255 { 1256 // Punt non-full tiles to generic store 1257 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1258 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1259 1260 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1261 { 1262 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1263 } 1264 1265 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1266 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1267 #if USE_8x2_TILE_BACKEND 1268 1269 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1270 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1271 1272 uint8_t* ppDsts[] = 1273 { 1274 pDst, // row 0, col 0 1275 pDst + pDstSurface->pitch, // row 1, col 0 1276 pDst + dx / 2, // row 0, col 1 1277 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1278 }; 1279 1280 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1281 { 1282 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1283 { 1284 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1285 1286 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1287 1288 ppDsts[0] += dx; 1289 ppDsts[1] += dx; 1290 ppDsts[2] += dx; 1291 ppDsts[3] += dx; 1292 } 1293 1294 ppDsts[0] += dy; 1295 ppDsts[1] += dy; 1296 ppDsts[2] += dy; 1297 ppDsts[3] += dy; 1298 } 1299 #else 1300 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 1301 1302 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1303 { 1304 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 1305 1306 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 1307 { 1308 // Format conversion and convert from SOA to AOS, and store the rows. 1309 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 1310 1311 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1312 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1313 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 1314 } 1315 1316 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 1317 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 1318 } 1319 #endif 1320 } 1321 }; 1322 1323 ////////////////////////////////////////////////////////////////////////// 1324 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp 1325 ////////////////////////////////////////////////////////////////////////// 1326 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1327 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> 1328 { 1329 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile; 1330 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1331 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1332 1333 ////////////////////////////////////////////////////////////////////////// 1334 /// @brief Stores an 8x8 raster tile to the destination surface. 1335 /// @param pSrc - Pointer to raster tile. 1336 /// @param pDstSurface - Destination surface state 1337 /// @param x, y - Coordinates to raster tile. 1338 INLINE static void Store( 1339 uint8_t *pSrc, 1340 SWR_SURFACE_STATE* pDstSurface, 1341 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1342 { 1343 // Punt non-full tiles to generic store 1344 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1345 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1346 1347 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1348 { 1349 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1350 } 1351 1352 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1353 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1354 #if USE_8x2_TILE_BACKEND 1355 1356 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1357 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1358 1359 uint8_t* ppDsts[] = 1360 { 1361 pDst, // row 0, col 0 1362 pDst + pDstSurface->pitch, // row 1, col 0 1363 pDst + dx / 2, // row 0, col 1 1364 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1365 }; 1366 1367 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1368 { 1369 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1370 { 1371 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1372 1373 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1374 1375 ppDsts[0] += dx; 1376 ppDsts[1] += dx; 1377 ppDsts[2] += dx; 1378 ppDsts[3] += dx; 1379 } 1380 1381 ppDsts[0] += dy; 1382 ppDsts[1] += dy; 1383 ppDsts[2] += dy; 1384 ppDsts[3] += dy; 1385 } 1386 #else 1387 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 1388 1389 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1390 { 1391 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 1392 1393 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 1394 { 1395 // Format conversion and convert from SOA to AOS, and store the rows. 1396 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 1397 1398 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1399 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1400 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 1401 } 1402 1403 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 1404 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 1405 } 1406 #endif 1407 } 1408 }; 1409 1410 ////////////////////////////////////////////////////////////////////////// 1411 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp 1412 ////////////////////////////////////////////////////////////////////////// 1413 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1414 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> 1415 { 1416 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile; 1417 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1418 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1419 static const size_t MAX_DST_COLUMN_BYTES = 16; 1420 #if !USE_8x2_TILE_BACKEND 1421 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 1422 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1423 #endif 1424 1425 ////////////////////////////////////////////////////////////////////////// 1426 /// @brief Stores an 8x8 raster tile to the destination surface. 1427 /// @param pSrc - Pointer to raster tile. 1428 /// @param pDstSurface - Destination surface state 1429 /// @param x, y - Coordinates to raster tile. 1430 INLINE static void Store( 1431 uint8_t *pSrc, 1432 SWR_SURFACE_STATE* pDstSurface, 1433 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1434 { 1435 // Punt non-full tiles to generic store 1436 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1437 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1438 1439 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1440 { 1441 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1442 } 1443 1444 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1445 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1446 #if USE_8x2_TILE_BACKEND 1447 1448 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1449 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; 1450 1451 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1452 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets"); 1453 1454 uint8_t *ppDsts[] = 1455 { 1456 pDst, // row 0, col 0 1457 pDst + pDstSurface->pitch, // row 1, col 0 1458 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 1459 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 1460 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 1461 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 1462 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 1463 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3 1464 }; 1465 1466 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1467 { 1468 // Raster tile width is same as simd16 tile width 1469 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1470 1471 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1472 1473 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1474 1475 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) 1476 { 1477 ppDsts[i] += dy; 1478 } 1479 } 1480 #else 1481 uint8_t* ppDsts[] = 1482 { 1483 pDst, // row 0, col 0 1484 pDst + pDstSurface->pitch, // row 1, col 0 1485 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 1486 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 1487 }; 1488 1489 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1490 { 1491 uint8_t* ppStartRows[] = 1492 { 1493 ppDsts[0], 1494 ppDsts[1], 1495 ppDsts[2], 1496 ppDsts[3], 1497 }; 1498 1499 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 1500 { 1501 // Format conversion and convert from SOA to AOS, and store the rows. 1502 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1503 1504 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 1505 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 1506 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 1507 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 1508 pSrc += SRC_COLUMN_BYTES; 1509 } 1510 1511 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 1512 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 1513 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; 1514 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; 1515 } 1516 #endif 1517 } 1518 }; 1519 1520 ////////////////////////////////////////////////////////////////////////// 1521 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp 1522 ////////////////////////////////////////////////////////////////////////// 1523 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1524 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> 1525 { 1526 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; 1527 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1528 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1529 static const size_t MAX_DST_COLUMN_BYTES = 16; 1530 #if !USE_8x2_TILE_BACKEND 1531 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 1532 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1533 #endif 1534 1535 ////////////////////////////////////////////////////////////////////////// 1536 /// @brief Stores an 8x8 raster tile to the destination surface. 1537 /// @param pSrc - Pointer to raster tile. 1538 /// @param pDstSurface - Destination surface state 1539 /// @param x, y - Coordinates to raster tile. 1540 INLINE static void Store( 1541 uint8_t *pSrc, 1542 SWR_SURFACE_STATE* pDstSurface, 1543 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1544 { 1545 // Punt non-full tiles to generic store 1546 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1547 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1548 1549 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1550 { 1551 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1552 } 1553 1554 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1555 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1556 #if USE_8x2_TILE_BACKEND 1557 1558 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1559 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; 1560 1561 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1562 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets"); 1563 1564 uint8_t* ppDsts[] = 1565 { 1566 pDst, // row 0, col 0 1567 pDst + pDstSurface->pitch, // row 1, col 0 1568 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 1569 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 1570 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 1571 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 1572 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 1573 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3 1574 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4 1575 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4 1576 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5 1577 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5 1578 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6 1579 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6 1580 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7 1581 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7 1582 }; 1583 1584 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1585 { 1586 // Raster tile width is same as simd16 tile width 1587 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1588 1589 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1590 1591 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1592 1593 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) 1594 { 1595 ppDsts[i] += dy; 1596 } 1597 } 1598 #else 1599 struct DstPtrs 1600 { 1601 uint8_t* ppDsts[8]; 1602 } ptrs; 1603 1604 // Need 8 pointers, 4 columns of 2 rows each 1605 for (uint32_t y = 0; y < 2; ++y) 1606 { 1607 for (uint32_t x = 0; x < 4; ++x) 1608 { 1609 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; 1610 } 1611 } 1612 1613 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 1614 { 1615 DstPtrs startPtrs = ptrs; 1616 1617 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 1618 { 1619 // Format conversion and convert from SOA to AOS, and store the rows. 1620 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); 1621 1622 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 1623 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 1624 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 1625 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 1626 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; 1627 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; 1628 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; 1629 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; 1630 pSrc += SRC_COLUMN_BYTES; 1631 } 1632 1633 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; 1634 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; 1635 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; 1636 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; 1637 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; 1638 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; 1639 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; 1640 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; 1641 } 1642 #endif 1643 } 1644 }; 1645 1646 ////////////////////////////////////////////////////////////////////////// 1647 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp 1648 ////////////////////////////////////////////////////////////////////////// 1649 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1650 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> 1651 { 1652 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile; 1653 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1654 1655 ////////////////////////////////////////////////////////////////////////// 1656 /// @brief Stores an 8x8 raster tile to the destination surface. 1657 /// @param pSrc - Pointer to raster tile. 1658 /// @param pDstSurface - Destination surface state 1659 /// @param x, y - Coordinates to raster tile. 1660 INLINE static void Store( 1661 uint8_t *pSrc, 1662 SWR_SURFACE_STATE* pDstSurface, 1663 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1664 { 1665 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1666 1667 // Punt non-full tiles to generic store 1668 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1669 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1670 1671 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1672 { 1673 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1674 } 1675 1676 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1677 // We can compute the offsets to each column within the raster tile once and increment from these. 1678 #if USE_8x2_TILE_BACKEND 1679 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1680 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1681 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1682 1683 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1684 1685 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1686 uint8_t *ppDsts[] = 1687 { 1688 pDst, 1689 pDst + DestRowWidthBytes, 1690 pDst + DestRowWidthBytes / 4, 1691 pDst + DestRowWidthBytes + DestRowWidthBytes / 4 1692 }; 1693 1694 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1695 { 1696 // Raster tile width is same as simd16 tile width 1697 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1698 1699 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1700 1701 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1702 1703 ppDsts[0] += dy; 1704 ppDsts[1] += dy; 1705 ppDsts[2] += dy; 1706 ppDsts[3] += dy; 1707 } 1708 #else 1709 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 1710 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1711 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1712 1713 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 1714 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 1715 1716 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1717 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 1718 { 1719 uint32_t rowOffset = row * DestRowWidthBytes; 1720 1721 uint8_t* pRow = pCol0 + rowOffset; 1722 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 1723 1724 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1725 pSrc += pSrcInc; 1726 1727 ppDsts[0] += DestRowWidthBytes / 4; 1728 ppDsts[1] += DestRowWidthBytes / 4; 1729 1730 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1731 pSrc += pSrcInc; 1732 } 1733 #endif 1734 } 1735 }; 1736 1737 ////////////////////////////////////////////////////////////////////////// 1738 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp 1739 ////////////////////////////////////////////////////////////////////////// 1740 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1741 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> 1742 { 1743 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; 1744 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1745 1746 ////////////////////////////////////////////////////////////////////////// 1747 /// @brief Stores an 8x8 raster tile to the destination surface. 1748 /// @param pSrc - Pointer to raster tile. 1749 /// @param pDstSurface - Destination surface state 1750 /// @param x, y - Coordinates to raster tile. 1751 INLINE static void Store( 1752 uint8_t *pSrc, 1753 SWR_SURFACE_STATE* pDstSurface, 1754 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1755 { 1756 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1757 1758 // Punt non-full tiles to generic store 1759 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1760 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1761 1762 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1763 { 1764 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1765 } 1766 1767 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1768 // We can compute the offsets to each column within the raster tile once and increment from these. 1769 #if USE_8x2_TILE_BACKEND 1770 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1771 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1772 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1773 1774 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1775 1776 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1777 uint8_t *ppDsts[] = 1778 { 1779 pDst, 1780 pDst + DestRowWidthBytes, 1781 pDst + DestRowWidthBytes / 2, 1782 pDst + DestRowWidthBytes + DestRowWidthBytes / 2 1783 }; 1784 1785 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1786 { 1787 // Raster tile width is same as simd16 tile width 1788 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1789 1790 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1791 1792 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1793 1794 ppDsts[0] += dy; 1795 ppDsts[1] += dy; 1796 ppDsts[2] += dy; 1797 ppDsts[3] += dy; 1798 } 1799 #else 1800 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 1801 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1802 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1803 1804 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 1805 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 1806 1807 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1808 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 1809 { 1810 uint32_t rowOffset = row * DestRowWidthBytes; 1811 1812 uint8_t* pRow = pCol0 + rowOffset; 1813 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 1814 1815 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1816 pSrc += pSrcInc; 1817 1818 ppDsts[0] += DestRowWidthBytes / 2; 1819 ppDsts[1] += DestRowWidthBytes / 2; 1820 1821 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1822 pSrc += pSrcInc; 1823 } 1824 #endif 1825 } 1826 }; 1827 1828 ////////////////////////////////////////////////////////////////////////// 1829 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp 1830 ////////////////////////////////////////////////////////////////////////// 1831 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1832 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> 1833 { 1834 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; 1835 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1836 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 1837 1838 ////////////////////////////////////////////////////////////////////////// 1839 /// @brief Stores an 8x8 raster tile to the destination surface. 1840 /// @param pSrc - Pointer to raster tile. 1841 /// @param pDstSurface - Destination surface state 1842 /// @param x, y - Coordinates to raster tile. 1843 INLINE static void Store( 1844 uint8_t *pSrc, 1845 SWR_SURFACE_STATE* pDstSurface, 1846 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1847 { 1848 static const uint32_t DestRowWidthBytes = 512; // 512B rows 1849 1850 // Punt non-full tiles to generic store 1851 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1852 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1853 1854 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1855 { 1856 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1857 } 1858 1859 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. 1860 // We can compute the offsets to each column within the raster tile once and increment from these. 1861 #if USE_8x2_TILE_BACKEND 1862 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1863 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1864 1865 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1866 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1867 1868 uint8_t* ppDsts[] = 1869 { 1870 pDst, // row 0, col 0 1871 pDst + DestRowWidthBytes, // row 1, col 0 1872 pDst + dx / 2, // row 0, col 1 1873 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1 1874 }; 1875 1876 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1877 { 1878 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1879 { 1880 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1881 1882 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1883 1884 ppDsts[0] += dx; 1885 ppDsts[1] += dx; 1886 ppDsts[2] += dx; 1887 ppDsts[3] += dx; 1888 } 1889 1890 ppDsts[0] += dy; 1891 ppDsts[1] += dy; 1892 ppDsts[2] += dy; 1893 ppDsts[3] += dy; 1894 } 1895 #else 1896 uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1897 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1898 uint8_t* pRow1 = pRow0 + DestRowWidthBytes; 1899 1900 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 1901 { 1902 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) 1903 { 1904 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8); 1905 1906 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; 1907 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1908 1909 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 1910 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 1911 } 1912 1913 pRow0 += (DestRowWidthBytes * 2); 1914 pRow1 += (DestRowWidthBytes * 2); 1915 } 1916 #endif 1917 } 1918 }; 1919 1920 ////////////////////////////////////////////////////////////////////////// 1921 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp 1922 ////////////////////////////////////////////////////////////////////////// 1923 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 1924 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> 1925 { 1926 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; 1927 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1928 1929 ////////////////////////////////////////////////////////////////////////// 1930 /// @brief Stores an 8x8 raster tile to the destination surface. 1931 /// @param pSrc - Pointer to raster tile. 1932 /// @param pDstSurface - Destination surface state 1933 /// @param x, y - Coordinates to raster tile. 1934 INLINE static void Store( 1935 uint8_t *pSrc, 1936 SWR_SURFACE_STATE* pDstSurface, 1937 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 1938 { 1939 static const uint32_t DestRowWidthBytes = 16; // 16B rows 1940 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 1941 1942 // Punt non-full tiles to generic store 1943 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 1944 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 1945 1946 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 1947 { 1948 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 1949 } 1950 1951 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 1952 // We can compute the offsets to each column within the raster tile once and increment from these. 1953 #if USE_8x2_TILE_BACKEND 1954 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 1955 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1956 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1957 1958 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1959 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1960 1961 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1962 uint8_t *ppDsts[] = 1963 { 1964 pDst, // row 0, col 0 1965 pDst + DestRowWidthBytes, // row 1, col 0 1966 pDst + DestColumnBytes, // row 0, col 1 1967 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1 1968 }; 1969 1970 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1971 { 1972 // Raster tile width is same as simd16 tile width 1973 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1974 1975 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1976 1977 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1978 1979 ppDsts[0] += dy; 1980 ppDsts[1] += dy; 1981 ppDsts[2] += dy; 1982 ppDsts[3] += dy; 1983 } 1984 #else 1985 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 1986 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1987 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1988 1989 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 1990 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 1991 1992 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1993 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 1994 { 1995 uint32_t rowOffset = row * DestRowWidthBytes; 1996 1997 uint8_t* pRow = pCol0 + rowOffset; 1998 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 1999 2000 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2001 pSrc += pSrcInc; 2002 2003 ppDsts[0] += DestColumnBytes; 2004 ppDsts[1] += DestColumnBytes; 2005 2006 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2007 pSrc += pSrcInc; 2008 } 2009 #endif 2010 } 2011 }; 2012 2013 ////////////////////////////////////////////////////////////////////////// 2014 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp 2015 ////////////////////////////////////////////////////////////////////////// 2016 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 2017 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> 2018 { 2019 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; 2020 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 2021 2022 ////////////////////////////////////////////////////////////////////////// 2023 /// @brief Stores an 8x8 raster tile to the destination surface. 2024 /// @param pSrc - Pointer to raster tile. 2025 /// @param pDstSurface - Destination surface state 2026 /// @param x, y - Coordinates to raster tile. 2027 INLINE static void Store( 2028 uint8_t *pSrc, 2029 SWR_SURFACE_STATE* pDstSurface, 2030 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 2031 { 2032 static const uint32_t DestRowWidthBytes = 16; // 16B rows 2033 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 2034 2035 // Punt non-full tiles to generic store 2036 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 2037 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 2038 2039 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 2040 { 2041 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 2042 } 2043 2044 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 2045 // We can compute the offsets to each column within the raster tile once and increment from these. 2046 #if USE_8x2_TILE_BACKEND 2047 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 2048 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2049 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2050 2051 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 2052 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 2053 2054 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 2055 uint8_t *ppDsts[] = 2056 { 2057 pDst, // row 0, col 0 2058 pDst + DestRowWidthBytes, // row 1, col 0 2059 pDst + DestColumnBytes, // row 0, col 1 2060 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 2061 pDst + DestColumnBytes * 2, // row 0, col 2 2062 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 2063 pDst + DestColumnBytes * 3, // row 0, col 3 2064 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3 2065 }; 2066 2067 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 2068 { 2069 // Raster tile width is same as simd16 tile width 2070 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 2071 2072 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2073 2074 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 2075 2076 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) 2077 { 2078 ppDsts[i] += dy; 2079 } 2080 } 2081 #else 2082 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 2083 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2084 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2085 uint8_t* pCol1 = pCol0 + DestColumnBytes; 2086 2087 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. 2088 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 2089 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 2090 2091 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 2092 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 2093 { 2094 uint32_t rowOffset = row * DestRowWidthBytes; 2095 uint8_t* ppDsts[] = 2096 { 2097 pCol0 + rowOffset, 2098 pCol0 + rowOffset + DestRowWidthBytes, 2099 pCol1 + rowOffset, 2100 pCol1 + rowOffset + DestRowWidthBytes, 2101 }; 2102 2103 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2104 pSrc += pSrcInc; 2105 2106 ppDsts[0] += DestColumnBytes * 2; 2107 ppDsts[1] += DestColumnBytes * 2; 2108 ppDsts[2] += DestColumnBytes * 2; 2109 ppDsts[3] += DestColumnBytes * 2; 2110 2111 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2112 pSrc += pSrcInc; 2113 } 2114 #endif 2115 } 2116 }; 2117 2118 ////////////////////////////////////////////////////////////////////////// 2119 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp 2120 ////////////////////////////////////////////////////////////////////////// 2121 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 2122 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> 2123 { 2124 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile; 2125 #if USE_8x2_TILE_BACKEND 2126 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 2127 2128 #else 2129 static const size_t TILE_Y_COL_WIDTH_BYTES = 16; 2130 static const size_t TILE_Y_ROWS = 32; 2131 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; 2132 2133 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 2134 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 2135 static const size_t MAX_DST_COLUMN_BYTES = 16; 2136 2137 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 2138 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; 2139 2140 #endif 2141 ////////////////////////////////////////////////////////////////////////// 2142 /// @brief Stores an 8x8 raster tile to the destination surface. 2143 /// @param pSrc - Pointer to raster tile. 2144 /// @param pDstSurface - Destination surface state 2145 /// @param x, y - Coordinates to raster tile. 2146 INLINE static void Store( 2147 uint8_t *pSrc, 2148 SWR_SURFACE_STATE* pDstSurface, 2149 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 2150 { 2151 #if USE_8x2_TILE_BACKEND 2152 static const uint32_t DestRowWidthBytes = 16; // 16B rows 2153 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 2154 #endif 2155 2156 // Punt non-full tiles to generic store 2157 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 2158 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 2159 2160 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 2161 { 2162 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 2163 } 2164 2165 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 2166 // We can compute the offsets to each column within the raster tile once and increment from these. 2167 #if USE_8x2_TILE_BACKEND 2168 // There will be 4 8x2 simd tiles in an 8x8 raster tile. 2169 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2170 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2171 2172 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 2173 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 2174 2175 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 2176 uint8_t *ppDsts[] = 2177 { 2178 pDst, // row 0, col 0 2179 pDst + DestRowWidthBytes, // row 1, col 0 2180 pDst + DestColumnBytes, // row 0, col 1 2181 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 2182 pDst + DestColumnBytes * 2, // row 0, col 2 2183 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 2184 pDst + DestColumnBytes * 3, // row 0, col 3 2185 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3 2186 pDst + DestColumnBytes * 4, // row 0, col 4 2187 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4 2188 pDst + DestColumnBytes * 5, // row 0, col 5 2189 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5 2190 pDst + DestColumnBytes * 6, // row 0, col 6 2191 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6 2192 pDst + DestColumnBytes * 7, // row 0, col 7 2193 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7 2194 }; 2195 2196 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 2197 { 2198 // Raster tile width is same as simd16 tile width 2199 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 2200 2201 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2202 2203 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 2204 2205 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) 2206 { 2207 ppDsts[i] += dy; 2208 } 2209 } 2210 #else 2211 // There will be 8 4x2 simd tiles in an 8x8 raster tile. 2212 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2213 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2214 struct DstPtrs 2215 { 2216 uint8_t* ppDsts[8]; 2217 } ptrs; 2218 2219 // Need 8 pointers, 4 columns of 2 rows each 2220 for (uint32_t y = 0; y < 2; ++y) 2221 { 2222 for (uint32_t x = 0; x < 4; ++x) 2223 { 2224 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; 2225 } 2226 } 2227 2228 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 2229 { 2230 DstPtrs startPtrs = ptrs; 2231 2232 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 2233 { 2234 // Format conversion and convert from SOA to AOS, and store the rows. 2235 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); 2236 2237 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 2238 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 2239 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 2240 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 2241 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; 2242 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; 2243 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; 2244 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; 2245 pSrc += SRC_COLUMN_BYTES; 2246 } 2247 2248 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; 2249 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; 2250 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; 2251 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; 2252 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; 2253 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; 2254 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; 2255 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; 2256 } 2257 #endif 2258 } 2259 }; 2260 2261 ////////////////////////////////////////////////////////////////////////// 2262 /// StoreMacroTile - Stores a macro tile which consists of raster tiles. 2263 ////////////////////////////////////////////////////////////////////////// 2264 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 2265 struct StoreMacroTile 2266 { 2267 ////////////////////////////////////////////////////////////////////////// 2268 /// @brief Stores a macrotile to the destination surface using safe implementation. 2269 /// @param pSrc - Pointer to macro tile. 2270 /// @param pDstSurface - Destination surface state 2271 /// @param x, y - Coordinates to macro tile 2272 static void StoreGeneric( 2273 uint8_t *pSrcHotTile, 2274 SWR_SURFACE_STATE* pDstSurface, 2275 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) 2276 { 2277 PFN_STORE_TILES_INTERNAL pfnStore; 2278 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; 2279 2280 // Store each raster tile from the hot tile to the destination surface. 2281 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 2282 { 2283 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 2284 { 2285 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 2286 { 2287 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); 2288 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 2289 } 2290 } 2291 } 2292 2293 } 2294 2295 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); 2296 ////////////////////////////////////////////////////////////////////////// 2297 /// @brief Stores a macrotile to the destination surface. 2298 /// @param pSrc - Pointer to macro tile. 2299 /// @param pDstSurface - Destination surface state 2300 /// @param x, y - Coordinates to macro tile 2301 static void Store( 2302 uint8_t *pSrcHotTile, 2303 SWR_SURFACE_STATE* pDstSurface, 2304 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) 2305 { 2306 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; 2307 2308 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 2309 { 2310 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>( 2311 0, 2312 0, 2313 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces 2314 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays 2315 sampleNum, 2316 pDstSurface->lod, 2317 pDstSurface); 2318 2319 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear 2320 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || 2321 (pDstSurface->bInterleavedSamples); 2322 2323 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; 2324 } 2325 2326 // Store each raster tile from the hot tile to the destination surface. 2327 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 2328 { 2329 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 2330 { 2331 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 2332 { 2333 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); 2334 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 2335 } 2336 } 2337 } 2338 } 2339 }; 2340 2341 ////////////////////////////////////////////////////////////////////////// 2342 /// InitStoreTilesTable - Helper for setting up the tables. 2343 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> 2344 void InitStoreTilesTableColor_Half1( 2345 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) 2346 { 2347 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; 2348 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; 2349 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; 2350 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; 2351 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store; 2352 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store; 2353 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; 2354 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; 2355 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; 2356 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store; 2357 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store; 2358 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; 2359 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; 2360 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; 2361 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; 2362 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; 2363 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; 2364 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store; 2365 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store; 2366 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; 2367 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store; 2368 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; 2369 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; 2370 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store; 2371 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store; 2372 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store; 2373 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store; 2374 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; 2375 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; 2376 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; 2377 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; 2378 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; 2379 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; 2380 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; 2381 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; 2382 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; 2383 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; 2384 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; 2385 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; 2386 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store; 2387 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store; 2388 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; 2389 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; 2390 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; 2391 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; 2392 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric; 2393 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store; 2394 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store; 2395 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store; 2396 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric; 2397 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric; 2398 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store; 2399 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; 2400 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; 2401 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; 2402 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; 2403 } 2404 2405 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> 2406 void InitStoreTilesTableColor_Half2( 2407 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT]) 2408 { 2409 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric; 2410 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; 2411 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric; 2412 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store; 2413 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store; 2414 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store; 2415 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store; 2416 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store; 2417 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store; 2418 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; 2419 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; 2420 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; 2421 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; 2422 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric; 2423 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; 2424 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; 2425 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; 2426 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store; 2427 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store; 2428 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store; 2429 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store; 2430 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store; 2431 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store; 2432 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store; 2433 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store; 2434 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store; 2435 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; 2436 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; 2437 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store; 2438 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store; 2439 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store; 2440 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store; 2441 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric; 2442 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric; 2443 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store; 2444 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store; 2445 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store; 2446 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store; 2447 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store; 2448 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store; 2449 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store; 2450 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; 2451 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; 2452 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store; 2453 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store; 2454 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; 2455 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; 2456 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; 2457 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store; 2458 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store; 2459 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; 2460 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; 2461 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; 2462 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; 2463 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric; 2464 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric; 2465 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; 2466 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; 2467 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric; 2468 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric; 2469 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; 2470 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric; 2471 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; 2472 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; 2473 } 2474 2475 ////////////////////////////////////////////////////////////////////////// 2476 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. 2477 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> 2478 void InitStoreTilesTableDepth( 2479 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) 2480 { 2481 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store; 2482 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; 2483 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; 2484 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store; 2485 } 2486 2487 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> 2488 void InitStoreTilesTableStencil( 2489 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) 2490 { 2491 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store; 2492 } 2493