• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35 
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
39 
40 #include <array>
41 #include <sstream>
42 
43 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
44 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
45 
46 //////////////////////////////////////////////////////////////////////////
47 /// Store Raster Tile Function Tables.
48 //////////////////////////////////////////////////////////////////////////
49 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
50 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
51 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52 
53 void InitStoreTilesTable_Linear_1();
54 void InitStoreTilesTable_Linear_2();
55 void InitStoreTilesTable_TileX_1();
56 void InitStoreTilesTable_TileX_2();
57 void InitStoreTilesTable_TileY_1();
58 void InitStoreTilesTable_TileY_2();
59 void InitStoreTilesTable_TileW();
60 void InitStoreTilesTable();
61 
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for storing one SIMD-sized raster tile to a
///        set of destination pointers.  The generic Store is deleted, so
///        only the explicit specializations can be called; any other
///        <PixelBits, NumDests> combination fails at compile time.
/// @param pSrc       - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts     - Array of destination pointers.
/// @tparam PixelBits - Size of one pixel in bits (the specializations
///                     cover 8, 16, 32, 64 and 128).
/// @tparam NumDests  - Number of destination pointers.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelBits, size_t NumDests>
struct StorePixels
{
    // Deliberately unavailable: there is no generic way to store a tile.
    static void Store(const uint8_t* pSrc,
                      uint8_t* (&ppDsts)[NumDests]) = delete;
};
76 
77 //////////////////////////////////////////////////////////////////////////
78 /// StorePixels (32-bit pixel specialization)
79 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
80 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
81 /// @param ppDsts   - Array of destination pointers.  Each pointer is
82 ///                   to a single row of at most 16B.
83 /// @tparam NumDests - Number of destination pointers.  Each pair of
84 ///                    pointers is for a 16-byte column of two rows.
85 //////////////////////////////////////////////////////////////////////////
86 template <>
87 struct StorePixels<8, 2>
88 {
89     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
90     {
91         // Each 4-pixel row is 4 bytes.
92         const uint16_t* pPixSrc = (const uint16_t*)pSrc;
93 
94         // Unswizzle from SWR-Z order
95         uint16_t* pRow = (uint16_t*)ppDsts[0];
96         pRow[0] = pPixSrc[0];
97         pRow[1] = pPixSrc[2];
98 
99         pRow = (uint16_t*)ppDsts[1];
100         pRow[0] = pPixSrc[1];
101         pRow[1] = pPixSrc[3];
102     }
103 };
104 
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster-tile to four destinations.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Four destination pointers.  Each receives 4 pixels
///                   (4 bytes).
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 16 pixels x 1 byte = 16 bytes, moved as eight 2-pixel (16-bit) pairs
        const uint16_t* pSrcPairs = reinterpret_cast<const uint16_t*>(pSrc);
        uint16_t**      ppRows    = reinterpret_cast<uint16_t**>(ppDsts);

        // Unswizzle from SWR-Z order.  Each group of four source pairs feeds
        // two destinations:
        //   group 0: ppDsts[0] <- pixels 0 1 4 5,  ppDsts[1] <- pixels 2 3 6 7
        //   group 1: ppDsts[2] <- pixels 8 9 C D,  ppDsts[3] <- pixels A B E F
        for (uint32_t group = 0; group < 2; ++group)
        {
            const uint16_t* pGroup = pSrcPairs + 4 * group;
            uint16_t*       pEven  = ppRows[2 * group + 0];
            uint16_t*       pOdd   = ppRows[2 * group + 1];

            pEven[0] = pGroup[0];
            pEven[1] = pGroup[2];

            pOdd[0] = pGroup[1];
            pOdd[1] = pGroup[3];
        }
    }
};
#endif
132 //////////////////////////////////////////////////////////////////////////
133 /// StorePixels (32-bit pixel specialization)
134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
135 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
136 /// @param ppDsts   - Array of destination pointers.  Each pointer is
137 ///                   to a single row of at most 16B.
138 /// @tparam NumDests - Number of destination pointers.  Each pair of
139 ///                    pointers is for a 16-byte column of two rows.
140 //////////////////////////////////////////////////////////////////////////
141 template <>
142 struct StorePixels<16, 2>
143 {
144     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
145     {
146         // Each 4-pixel row is 8 bytes.
147         const uint32_t* pPixSrc = (const uint32_t*)pSrc;
148 
149         // Unswizzle from SWR-Z order
150         uint32_t* pRow = (uint32_t*)ppDsts[0];
151         pRow[0] = pPixSrc[0];
152         pRow[1] = pPixSrc[2];
153 
154         pRow = (uint32_t*)ppDsts[1];
155         pRow[0] = pPixSrc[1];
156         pRow[1] = pPixSrc[3];
157     }
158 };
159 
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (16-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster-tile to four destinations.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Four destination pointers.  Each receives 4 pixels
///                   (8 bytes).
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<16, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 16 pixels x 2 bytes = 32 bytes, moved as eight 2-pixel (32-bit) pairs
        const uint32_t* pSrcPairs = reinterpret_cast<const uint32_t*>(pSrc);
        uint32_t**      ppRows    = reinterpret_cast<uint32_t**>(ppDsts);

        // Unswizzle from SWR-Z order.  Each group of four source pairs feeds
        // two destinations:
        //   group 0: ppDsts[0] <- pixels 0 1 4 5,  ppDsts[1] <- pixels 2 3 6 7
        //   group 1: ppDsts[2] <- pixels 8 9 C D,  ppDsts[3] <- pixels A B E F
        for (uint32_t group = 0; group < 2; ++group)
        {
            const uint32_t* pGroup = pSrcPairs + 4 * group;
            uint32_t*       pEven  = ppRows[2 * group + 0];
            uint32_t*       pOdd   = ppRows[2 * group + 1];

            pEven[0] = pGroup[0];
            pEven[1] = pGroup[2];

            pOdd[0] = pGroup[1];
            pOdd[1] = pGroup[3];
        }
    }
};
#endif
187 //////////////////////////////////////////////////////////////////////////
188 /// StorePixels (32-bit pixel specialization)
189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
190 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
191 /// @param ppDsts   - Array of destination pointers.  Each pointer is
192 ///                   to a single row of at most 16B.
193 /// @tparam NumDests - Number of destination pointers.  Each pair of
194 ///                    pointers is for a 16-byte column of two rows.
195 //////////////////////////////////////////////////////////////////////////
196 template <>
197 struct StorePixels<32, 2>
198 {
199     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
200     {
201         // Each 4-pixel row is 16-bytes
202         __m128i *pZRow01 = (__m128i*)pSrc;
203         __m128i vQuad00 = _mm_load_si128(pZRow01);
204         __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
205 
206         __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
207         __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
208 
209         _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
210         _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
211     }
212 };
213 
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster-tile to four destinations.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order.
///                   Read with aligned 128-bit loads.
/// @param ppDsts   - Four destination pointers.  Each receives 4 pixels
///                   (16 bytes) through an unaligned 128-bit store.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<32, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 4 x 16 bytes = 64 bytes, 16 pixels
        const __m128i* pSrcQuads = reinterpret_cast<const __m128i*>(pSrc);
        __m128i**      ppRows    = reinterpret_cast<__m128i**>(ppDsts);

        // Load the whole tile first (pixel indices in hex)
        const __m128i q0 = _mm_load_si128(&pSrcQuads[0]);   // 0 1 2 3
        const __m128i q1 = _mm_load_si128(&pSrcQuads[1]);   // 4 5 6 7
        const __m128i q2 = _mm_load_si128(&pSrcQuads[2]);   // 8 9 A B
        const __m128i q3 = _mm_load_si128(&pSrcQuads[3]);   // C D E F

        // Unswizzle from SWR-Z order by pairing the 64-bit halves of adjacent quads
        const __m128i out0 = _mm_unpacklo_epi64(q0, q1);    // 0 1 4 5
        const __m128i out1 = _mm_unpackhi_epi64(q0, q1);    // 2 3 6 7
        const __m128i out2 = _mm_unpacklo_epi64(q2, q3);    // 8 9 C D
        const __m128i out3 = _mm_unpackhi_epi64(q2, q3);    // A B E F

        _mm_storeu_si128(ppRows[0], out0);
        _mm_storeu_si128(ppRows[1], out1);
        _mm_storeu_si128(ppRows[2], out2);
        _mm_storeu_si128(ppRows[3], out3);
    }
};
#endif
239 //////////////////////////////////////////////////////////////////////////
240 /// StorePixels (32-bit pixel specialization)
241 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
242 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
243 /// @param ppDsts   - Array of destination pointers.  Each pointer is
244 ///                   to a single row of at most 16B.
245 /// @tparam NumDests - Number of destination pointers.  Each pair of
246 ///                    pointers is for a 16-byte column of two rows.
247 //////////////////////////////////////////////////////////////////////////
248 template <>
249 struct StorePixels<64, 4>
250 {
251     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
252     {
253         // Each 4-pixel row is 32 bytes.
254         const __m128i* pPixSrc = (const __m128i*)pSrc;
255 
256         // order of pointers match SWR-Z layout
257         __m128i** pvDsts = (__m128i**)&ppDsts[0];
258         *pvDsts[0] = pPixSrc[0];
259         *pvDsts[1] = pPixSrc[1];
260         *pvDsts[2] = pPixSrc[2];
261         *pvDsts[3] = pPixSrc[3];
262     }
263 };
264 
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (64-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster-tile to eight 16-byte destinations.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order.
///                   Read with aligned 128-bit loads, so it must be
///                   16-byte aligned.
/// @param ppDsts   - Eight destination pointers, already ordered to match
///                   the SWR-Z layout.  Each receives 2 pixels (16 bytes).
///                   Written with unaligned stores, like the 32-bit
///                   specialization, so the destinations need no
///                   particular alignment.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<64, 8>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    {
        // 8 x 16 bytes = 128 bytes, 16 pixels
        const __m128i* pSrc128 = reinterpret_cast<const __m128i*>(pSrc);

        // order of pointers match SWR-Z layout: destination i receives source
        // chunk i (pixels 2i and 2i+1).  The store is explicitly unaligned:
        // dereferencing a __m128i* would emit an aligned store and fault on a
        // destination that is not 16-byte aligned.
        for (uint32_t i = 0; i < 8; ++i)
        {
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i]), _mm_load_si128(&pSrc128[i]));
        }
    }
};
#endif
289 //////////////////////////////////////////////////////////////////////////
290 /// StorePixels (32-bit pixel specialization)
291 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
292 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
293 /// @param ppDsts   - Array of destination pointers.  Each pointer is
294 ///                   to a single row of at most 16B.
295 /// @tparam NumDests - Number of destination pointers.  Each pair of
296 ///                    pointers is for a 16-byte column of two rows.
297 //////////////////////////////////////////////////////////////////////////
298 template <>
299 struct StorePixels<128, 8>
300 {
301     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
302     {
303         // Each 4-pixel row is 64 bytes.
304         const __m128i* pPixSrc = (const __m128i*)pSrc;
305 
306         // Unswizzle from SWR-Z order
307         __m128i** pvDsts = (__m128i**)&ppDsts[0];
308         *pvDsts[0] = pPixSrc[0];
309         *pvDsts[1] = pPixSrc[2];
310         *pvDsts[2] = pPixSrc[1];
311         *pvDsts[3] = pPixSrc[3];
312         *pvDsts[4] = pPixSrc[4];
313         *pvDsts[5] = pPixSrc[6];
314         *pvDsts[6] = pPixSrc[5];
315         *pvDsts[7] = pPixSrc[7];
316     }
317 };
318 
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (128-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster-tile to sixteen destinations, one
///        pixel (16 bytes) each.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order.
///                   Read with aligned 128-bit loads, so it must be
///                   16-byte aligned.
/// @param ppDsts   - Sixteen destination pointers.  Each receives 1 pixel
///                   (16 bytes).  Written with unaligned stores, like the
///                   32-bit specialization, so the destinations need no
///                   particular alignment.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<128, 16>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    {
        // 16 x 16 bytes = 256 bytes, 16 pixels
        const __m128i* pSrc128 = reinterpret_cast<const __m128i*>(pSrc);

        // Unswizzle from SWR-Z order: within every group of four pixels the
        // two middle ones trade places.  The stores are explicitly unaligned:
        // dereferencing a __m128i* would emit an aligned store and fault on a
        // destination that is not 16-byte aligned.
        for (uint32_t i = 0; i < 16; i += 4)
        {
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i + 0]), _mm_load_si128(&pSrc128[i + 0]));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i + 1]), _mm_load_si128(&pSrc128[i + 2]));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i + 2]), _mm_load_si128(&pSrc128[i + 1]));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i + 3]), _mm_load_si128(&pSrc128[i + 3]));
        }
    }
};
#endif
341 //////////////////////////////////////////////////////////////////////////
342 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
343 //////////////////////////////////////////////////////////////////////////
344 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
345 struct ConvertPixelsSOAtoAOS
346 {
347     //////////////////////////////////////////////////////////////////////////
348     /// @brief Converts a SIMD from the Hot Tile to the destination format
349     ///        and converts from SOA to AOS.
350     /// @param pSrc - Pointer to raster tile.
351     /// @param pDst - Pointer to destination surface or deswizzling buffer.
352     template <size_t NumDests>
353     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
354     {
355 #if USE_8x2_TILE_BACKEND
356         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
357 
358         OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
359         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
360 
361         // Convert from SrcFormat --> DstFormat
362         simd16vector src;
363         LoadSOA<SrcFormat>(pSrc, src);
364         StoreSOA<DstFormat>(src, soaTile);
365 
366         // Convert from SOA --> AOS
367         FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
368 
369 #else
370         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
371 
372         OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
373         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
374 
375         // Convert from SrcFormat --> DstFormat
376         simdvector src;
377         LoadSOA<SrcFormat>(pSrc, src);
378         StoreSOA<DstFormat>(src, soaTile);
379 
380         // Convert from SOA --> AOS
381         FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
382 
383 #endif
384         // Store data into destination
385         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
386     }
387 };
388 
389 //////////////////////////////////////////////////////////////////////////
390 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
391 /// Specialization for no format conversion
392 //////////////////////////////////////////////////////////////////////////
393 template<SWR_FORMAT Format>
394 struct ConvertPixelsSOAtoAOS<Format, Format>
395 {
396     //////////////////////////////////////////////////////////////////////////
397     /// @brief Converts a SIMD from the Hot Tile to the destination format
398     ///        and converts from SOA to AOS.
399     /// @param pSrc - Pointer to raster tile.
400     /// @param pDst - Pointer to destination surface or deswizzling buffer.
401     template <size_t NumDests>
402     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
403     {
404 #if USE_8x2_TILE_BACKEND
405         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
406 
407         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
408 
409         // Convert from SOA --> AOS
410         FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
411 
412 #else
413         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
414 
415         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
416 
417         // Convert from SOA --> AOS
418         FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
419 
420 #endif
421         // Store data into destination
422         StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
423     }
424 };
425 
426 //////////////////////////////////////////////////////////////////////////
427 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
428 //////////////////////////////////////////////////////////////////////////
429 template<>
430 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
431 {
432     //////////////////////////////////////////////////////////////////////////
433     /// @brief Converts a SIMD from the Hot Tile to the destination format
434     ///        and converts from SOA to AOS.
435     /// @param pSrc - Pointer to raster tile.
436     /// @param pDst - Pointer to destination surface or deswizzling buffer.
437     template <size_t NumDests>
438     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
439     {
440 #if USE_8x2_TILE_BACKEND
441         static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
442         static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
443 
444         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
445 
446         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
447 
448         // Load hot-tile
449         simd16vector src, dst;
450         LoadSOA<SrcFormat>(pSrc, src);
451 
452         // deswizzle
453         dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
454         dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
455         dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
456 
457         // clamp
458         dst.x = Clamp<DstFormat>(dst.x, 0);
459         dst.y = Clamp<DstFormat>(dst.y, 1);
460         dst.z = Clamp<DstFormat>(dst.z, 2);
461 
462         // normalize
463         dst.x = Normalize<DstFormat>(dst.x, 0);
464         dst.y = Normalize<DstFormat>(dst.y, 1);
465         dst.z = Normalize<DstFormat>(dst.z, 2);
466 
467         // pack
468         simd16scalari packed = _simd16_castps_si(dst.x);
469 
470         SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
471         SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
472 
473         packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
474         packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
475 
476         // pack low 16 bits of each 32 bit lane to low 128 bits of dst
477         uint32_t *pPacked = (uint32_t*)&packed;
478         uint16_t *pAosTile = (uint16_t*)&aosTile[0];
479         for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
480         {
481             *pAosTile++ = *pPacked++;
482         }
483 
484 #else
485         static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
486         static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
487         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
488 
489         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
490 
491         // Load hot-tile
492         simdvector src, dst;
493         LoadSOA<SrcFormat>(pSrc, src);
494 
495         // deswizzle
496         dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
497         dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
498         dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
499 
500         // clamp
501         dst.x = Clamp<DstFormat>(dst.x, 0);
502         dst.y = Clamp<DstFormat>(dst.y, 1);
503         dst.z = Clamp<DstFormat>(dst.z, 2);
504 
505         // normalize
506         dst.x = Normalize<DstFormat>(dst.x, 0);
507         dst.y = Normalize<DstFormat>(dst.y, 1);
508         dst.z = Normalize<DstFormat>(dst.z, 2);
509 
510         // pack
511         simdscalari packed = _simd_castps_si(dst.x);
512         packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
513         packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
514                                                                               FormatTraits<DstFormat>::GetBPC(1)));
515 
516         // pack low 16 bits of each 32 bit lane to low 128 bits of dst
517         uint32_t *pPacked = (uint32_t*)&packed;
518         uint16_t *pAosTile = (uint16_t*)&aosTile[0];
519         for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
520         {
521             *pAosTile++ = *pPacked++;
522         }
523 
524 #endif
525         // Store data into destination
526         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
527     }
528 };
529 
530 //////////////////////////////////////////////////////////////////////////
531 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
532 //////////////////////////////////////////////////////////////////////////
533 template<>
534 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
535 {
536     static const SWR_FORMAT SrcFormat = R32_FLOAT;
537     static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
538 
539     //////////////////////////////////////////////////////////////////////////
540     /// @brief Converts a SIMD from the Hot Tile to the destination format
541     ///        and converts from SOA to AOS.
542     /// @param pSrc - Pointer to raster tile.
543     /// @param pDst - Pointer to destination surface or deswizzling buffer.
544     template <size_t NumDests>
545     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
546     {
547 #if USE_8x2_TILE_BACKEND
548         simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
549 
550         // clamp
551         const simd16scalar zero = _simd16_setzero_ps();
552         const simd16scalar ones = _simd16_set1_ps(1.0f);
553 
554         comp = _simd16_max_ps(comp, zero);
555         comp = _simd16_min_ps(comp, ones);
556 
557         // normalize
558         comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
559 
560         simd16scalari temp = _simd16_cvtps_epi32(comp);
561 
562         // swizzle
563         temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
564 
565         // merge/store data into destination but don't overwrite the X8 bits
566         simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
567         simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
568 
569         simd16scalari dest = _simd16_setzero_si();
570 
571         dest = _simd16_insert_si(dest, destlo, 0);
572         dest = _simd16_insert_si(dest, desthi, 1);
573 
574         simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
575 
576         dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
577 
578         _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
579         _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
580 #else
581         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
582 
583         OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
584         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
585 
586         // Convert from SrcFormat --> DstFormat
587         simdvector src;
588         LoadSOA<SrcFormat>(pSrc, src);
589         StoreSOA<DstFormat>(src, soaTile);
590 
591         // Convert from SOA --> AOS
592         FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
593 
594         // Store data into destination but don't overwrite the X8 bits
595         // Each 4-pixel row is 16-bytes
596         __m128i *pZRow01 = (__m128i*)aosTile;
597         __m128i vQuad00 = _mm_load_si128(pZRow01);
598         __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
599 
600         __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
601         __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
602 
603         __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
604         __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
605 
606         __m128i vMask = _mm_set1_epi32(0xFFFFFF);
607 
608         vDst0 = _mm_andnot_si128(vMask, vDst0);
609         vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
610         vDst1 = _mm_andnot_si128(vMask, vDst1);
611         vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
612 
613         _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
614         _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
615 #endif
616     }
617 };
618 
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// FlatConvert (8x2 tile backend)
/// @brief Converts one SIMD16 of float32 SOA pixels to packed 32-bit AOS
///        pixels (one byte per component) and stores them to four
///        16-byte destinations.
/// @param pSrc   - Source: four consecutive simd16scalar-sized component
///                 planes, read with _simd16_load_ps.
/// @param pDst0  - receives pixels 0 1 4 5
/// @param pDst1  - receives pixels 2 3 6 7
/// @param pDst2  - receives pixels 8 9 C D
/// @param pDst3  - receives pixels A B E F
/// @tparam DstFormat - Supplies the component swizzle, the isSRGB flag
///                     and the per-component float scale factors.
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    // size in bytes of one SOA component plane
    const size_t planeBytes = sizeof(simd16scalar);

    // apply the destination format's component swizzle while we load
    simd16scalar chan0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * planeBytes));
    simd16scalar chan1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * planeBytes));
    simd16scalar chan2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * planeBytes));
    simd16scalar chan3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * planeBytes));

    // clamp every component to [0, 1]
    const simd16scalar lowerBound = _simd16_setzero_ps();
    const simd16scalar upperBound = _simd16_set1_ps(1.0f);

    chan0 = _simd16_min_ps(_simd16_max_ps(chan0, lowerBound), upperBound);
    chan1 = _simd16_min_ps(_simd16_max_ps(chan1, lowerBound), upperBound);
    chan2 = _simd16_min_ps(_simd16_max_ps(chan2, lowerBound), upperBound);
    chan3 = _simd16_min_ps(_simd16_max_ps(chan3, lowerBound), upperBound);

    // gamma-correct the first three components only; the fourth is left linear
    if (FormatTraits<DstFormat>::isSRGB)
    {
        chan0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, chan0);
        chan1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, chan1);
        chan2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, chan2);
    }

    // scale 0.0f..1.0f up to the destination format's integer range
    chan0 = _simd16_mul_ps(chan0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    chan1 = _simd16_mul_ps(chan1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    chan2 = _simd16_mul_ps(chan2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    chan3 = _simd16_mul_ps(chan3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));

    // float --> 32-bit integer, one lane per pixel
    simd16scalari byte0 = _simd16_cvtps_epi32(chan0);
    simd16scalari byte1 = _simd16_cvtps_epi32(chan1);
    simd16scalari byte2 = _simd16_cvtps_epi32(chan2);
    simd16scalari byte3 = _simd16_cvtps_epi32(chan3);

    // SOA to AOS conversion: move each component to its own byte of the lane, then combine
    byte1 = _simd16_slli_epi32(byte1,  8);
    byte2 = _simd16_slli_epi32(byte2, 16);
    byte3 = _simd16_slli_epi32(byte3, 24);

    simd16scalari pixels = _simd16_or_si(_simd16_or_si(byte0, byte1), _simd16_or_si(byte2, byte3));  // 0 1 2 3 4 5 6 7 8 9 A B C D E F

    // de-swizzle conversion
#if 1
    // duplicate the even and the odd 128-bit quads, then interleave their 64-bit halves
    simd16scalari evenQuads = _simd16_permute2f128_si(pixels, pixels, 0xA0); // (2, 2, 0, 0)     // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari oddQuads  = _simd16_permute2f128_si(pixels, pixels, 0xF5); // (3, 3, 1, 1)     // 4 5 6 7 4 5 6 7 C D E F C D E F

    pixels = _simd16_shuffle_epi64(evenQuads, oddQuads, 0xCC); // (1 1 0 0 1 1 0 0)              // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
    // alternative: a single 32-bit lane permute
    pixels = _simd16_permute_epi32(pixels, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

#endif
    // store 8x2 memory order:
    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(pixels, 0));
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(pixels, 1));
}

#endif
691 template<SWR_FORMAT DstFormat>
692 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
693 {
694     static const uint32_t offset = sizeof(simdscalar);
695 
696     // swizzle rgba -> bgra while we load
697     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
698     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
699     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
700     simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
701 
702     // clamp
703     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
704     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
705 
706     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
707     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
708 
709     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
710     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
711 
712     vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
713     vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
714 
715     if (FormatTraits<DstFormat>::isSRGB)
716     {
717         // Gamma-correct only rgb
718         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
719         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
720         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
721     }
722 
723     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
724     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
725     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
726     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
727     vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
728 
729     // moving to 8 wide integer vector types
730     __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
731     __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
732     __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
733     __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
734 
735 #if KNOB_ARCH == KNOB_ARCH_AVX
736 
737     // splitting into two sets of 4 wide integer vector types
738     // because AVX doesn't have instructions to support this operation at 8 wide
739     __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
740     __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
741     __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
742     __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
743 
744     __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
745     __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
746     __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
747     __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
748 
749     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
750     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
751     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
752     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
753     srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
754     srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
755 
756     srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
757     srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
758 
759     srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
760     srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
761 
762     srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
763     srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
764 
765     // unpack into rows that get the tiling order correct
766     __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
767     __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
768 
769     __m256i final = _mm256_castsi128_si256(vRow00);
770     final = _mm256_insertf128_si256(final, vRow10, 1);
771 
772 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
773 
774     // logic is as above, only wider
775     src1 = _mm256_slli_si256(src1, 1);
776     src2 = _mm256_slli_si256(src2, 2);
777     src3 = _mm256_slli_si256(src3, 3);
778 
779     src0 = _mm256_or_si256(src0, src1);
780     src2 = _mm256_or_si256(src2, src3);
781 
782     __m256i final = _mm256_or_si256(src0, src2);
783 #if 0
784 
785     __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
786 
787     final = _mm256_permutevar8x32_epi32(final, perm);
788 #else
789 
790     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
791     final = _mm256_permute4x64_epi64(final, 0xD8);
792 #endif
793 #endif
794 
795     _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
796 }
797 
798 #if USE_8x2_TILE_BACKEND
799 template<SWR_FORMAT DstFormat>
800 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
801 {
802     // swizzle rgba -> bgra while we load
803     simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
804     simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
805     simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
806 
807     // clamp
808     const simd16scalar zero = _simd16_setzero_ps();
809     const simd16scalar ones = _simd16_set1_ps(1.0f);
810 
811     comp0 = _simd16_max_ps(comp0, zero);
812     comp0 = _simd16_min_ps(comp0, ones);
813 
814     comp1 = _simd16_max_ps(comp1, zero);
815     comp1 = _simd16_min_ps(comp1, ones);
816 
817     comp2 = _simd16_max_ps(comp2, zero);
818     comp2 = _simd16_min_ps(comp2, ones);
819 
820     // gamma-correct only rgb
821     if (FormatTraits<DstFormat>::isSRGB)
822     {
823         comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
824         comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
825         comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
826     }
827 
828     // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
829     comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
830     comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
831     comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
832 
833     // moving to 16 wide integer vector types
834     simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
835     simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
836     simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
837 
838     // SOA to AOS conversion
839     src1 = _simd16_slli_epi32(src1,  8);
840     src2 = _simd16_slli_epi32(src2, 16);
841 
842     simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2);                       // 0 1 2 3 4 5 6 7 8 9 A B C D E F
843 
844     // de-swizzle conversion
845 #if 1
846     simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
847     simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
848 
849     final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
850 
851 #else
852     final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
853 
854 #endif
855     // store 8x2 memory order:
856     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
857     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
858     _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
859     _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
860 }
861 
862 #endif
863 template<SWR_FORMAT DstFormat>
864 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
865 {
866     static const uint32_t offset = sizeof(simdscalar);
867 
868     // swizzle rgba -> bgra while we load
869     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
870     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
871     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
872                                                                                                             // clamp
873     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
874     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
875 
876     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
877     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
878 
879     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
880     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
881 
882     if (FormatTraits<DstFormat>::isSRGB)
883     {
884         // Gamma-correct only rgb
885         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
886         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
887         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
888     }
889 
890     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
891     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
892     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
893     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
894 
895     // moving to 8 wide integer vector types
896     __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
897     __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
898     __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
899 
900 #if KNOB_ARCH == KNOB_ARCH_AVX
901 
902     // splitting into two sets of 4 wide integer vector types
903     // because AVX doesn't have instructions to support this operation at 8 wide
904     __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
905     __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
906     __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
907 
908     __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
909     __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
910     __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
911 
912     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
913     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
914     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
915     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
916 
917     srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
918 
919     srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
920 
921     srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
922     srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
923 
924     // unpack into rows that get the tiling order correct
925     __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
926     __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
927 
928     __m256i final = _mm256_castsi128_si256(vRow00);
929     final = _mm256_insertf128_si256(final, vRow10, 1);
930 
931 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
932 
933                                               // logic is as above, only wider
934     src1 = _mm256_slli_si256(src1, 1);
935     src2 = _mm256_slli_si256(src2, 2);
936 
937     src0 = _mm256_or_si256(src0, src1);
938 
939     __m256i final = _mm256_or_si256(src0, src2);
940 
941     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
942     final = _mm256_permute4x64_epi64(final, 0xD8);
943 
944 #endif
945 
946     _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
947 }
948 
949 template<>
950 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
951 {
952     template <size_t NumDests>
953     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
954     {
955 #if USE_8x2_TILE_BACKEND
956         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
957 #else
958         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
959 #endif
960     }
961 };
962 
963 template<>
964 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
965 {
966     template <size_t NumDests>
967     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
968     {
969 #if USE_8x2_TILE_BACKEND
970         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
971 #else
972         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
973 #endif
974     }
975 };
976 
977 template<>
978 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
979 {
980     template <size_t NumDests>
981     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
982     {
983 #if USE_8x2_TILE_BACKEND
984         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
985 #else
986         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
987 #endif
988     }
989 };
990 
991 template<>
992 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
993 {
994     template <size_t NumDests>
995     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
996     {
997 #if USE_8x2_TILE_BACKEND
998         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
999 #else
1000         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1001 #endif
1002     }
1003 };
1004 
1005 template<>
1006 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
1007 {
1008     template <size_t NumDests>
1009     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1010     {
1011 #if USE_8x2_TILE_BACKEND
1012         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1013 #else
1014         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1015 #endif
1016     }
1017 };
1018 
1019 template<>
1020 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
1021 {
1022     template <size_t NumDests>
1023     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1024     {
1025 #if USE_8x2_TILE_BACKEND
1026         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1027 #else
1028         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1029 #endif
1030     }
1031 };
1032 
1033 template<>
1034 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
1035 {
1036     template <size_t NumDests>
1037     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1038     {
1039 #if USE_8x2_TILE_BACKEND
1040         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1041 #else
1042         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1043 #endif
1044     }
1045 };
1046 
1047 template<>
1048 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
1049 {
1050     template <size_t NumDests>
1051     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1052     {
1053 #if USE_8x2_TILE_BACKEND
1054         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1055 #else
1056         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1057 #endif
1058     }
1059 };
1060 
1061 //////////////////////////////////////////////////////////////////////////
1062 /// StoreRasterTile
1063 //////////////////////////////////////////////////////////////////////////
1064 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1065 struct StoreRasterTile
1066 {
1067     //////////////////////////////////////////////////////////////////////////
1068     /// @brief Retrieve color from hot tile source which is always float.
1069     /// @param pSrc - Pointer to raster tile.
1070     /// @param x, y - Coordinates to raster tile.
1071     /// @param output - output color
1072     INLINE static void GetSwizzledSrcColor(
1073         uint8_t* pSrc,
1074         uint32_t x, uint32_t y,
1075         float outputColor[4])
1076     {
1077 #if USE_8x2_TILE_BACKEND
1078         typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1079 
1080         SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1081 
1082         // Compute which simd tile we're accessing within 8x8 tile.
1083         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1084         uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1085 
1086         SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1087 
1088         uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1089 
1090         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1091 #else
1092         typedef SimdTile<SrcFormat, DstFormat> SimdT;
1093 
1094         SimdT* pSrcSimdTiles = (SimdT*)pSrc;
1095 
1096         // Compute which simd tile we're accessing within 8x8 tile.
1097         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1098         uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
1099 
1100         SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
1101 
1102         uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
1103 
1104         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1105 #endif
1106     }
1107 
1108     //////////////////////////////////////////////////////////////////////////
1109     /// @brief Stores an 8x8 raster tile to the destination surface.
1110     /// @param pSrc - Pointer to raster tile.
1111     /// @param pDstSurface - Destination surface state
1112     /// @param x, y - Coordinates to raster tile.
1113     INLINE static void Store(
1114         uint8_t *pSrc,
1115         SWR_SURFACE_STATE* pDstSurface,
1116         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1117     {
1118         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1119         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1120 
1121         // For each raster tile pixel (rx, ry)
1122         for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1123         {
1124             for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1125             {
1126                 // Perform bounds checking.
1127                 if (((x + rx) < lodWidth) &&
1128                     ((y + ry) < lodHeight))
1129                 {
1130                     float srcColor[4];
1131                     GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
1132 
1133                     uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1134                         pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
1135                         sampleNum, pDstSurface->lod, pDstSurface);
1136                     {
1137                         ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
1138                     }
1139                 }
1140             }
1141         }
1142     }
1143 };
1144 
1145 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1146 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1147 {};
1148 
1149 //////////////////////////////////////////////////////////////////////////
1150 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1151 //////////////////////////////////////////////////////////////////////////
1152 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1153 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1154 {
1155     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1156     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1157     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1158 
1159     //////////////////////////////////////////////////////////////////////////
1160     /// @brief Stores an 8x8 raster tile to the destination surface.
1161     /// @param pSrc - Pointer to raster tile.
1162     /// @param pDstSurface - Destination surface state
1163     /// @param x, y - Coordinates to raster tile.
1164     INLINE static void Store(
1165         uint8_t *pSrc,
1166         SWR_SURFACE_STATE* pDstSurface,
1167         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1168     {
1169         // Punt non-full tiles to generic store
1170         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1171         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1172 
1173         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1174         {
1175             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1176         }
1177 
1178         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1179             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1180 #if USE_8x2_TILE_BACKEND
1181 
1182         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1183         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1184 
1185         uint8_t* ppDsts[] =
1186         {
1187             pDst,                                           // row 0, col 0
1188             pDst + pDstSurface->pitch,                      // row 1, col 0
1189             pDst + dx / 2,                                  // row 0, col 1
1190             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1191         };
1192 
1193         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1194         {
1195             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1196             {
1197                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1198 
1199                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1200 
1201                 ppDsts[0] += dx;
1202                 ppDsts[1] += dx;
1203                 ppDsts[2] += dx;
1204                 ppDsts[3] += dx;
1205             }
1206 
1207             ppDsts[0] += dy;
1208             ppDsts[1] += dy;
1209             ppDsts[2] += dy;
1210             ppDsts[3] += dy;
1211         }
1212 #else
1213         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1214 
1215         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1216         {
1217             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1218 
1219             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1220             {
1221                 // Format conversion and convert from SOA to AOS, and store the rows.
1222                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1223 
1224                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1225                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1226                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1227             }
1228 
1229             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1230             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1231         }
1232 #endif
1233     }
1234 };
1235 
1236 //////////////////////////////////////////////////////////////////////////
1237 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1238 //////////////////////////////////////////////////////////////////////////
1239 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1240 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1241 {
1242     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1243     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1244     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1245 
1246     //////////////////////////////////////////////////////////////////////////
1247     /// @brief Stores an 8x8 raster tile to the destination surface.
1248     /// @param pSrc - Pointer to raster tile.
1249     /// @param pDstSurface - Destination surface state
1250     /// @param x, y - Coordinates to raster tile.
1251     INLINE static void Store(
1252         uint8_t *pSrc,
1253         SWR_SURFACE_STATE* pDstSurface,
1254         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1255     {
1256         // Punt non-full tiles to generic store
1257         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1258         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1259 
1260         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1261         {
1262             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1263         }
1264 
1265         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1266             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1267 #if USE_8x2_TILE_BACKEND
1268 
1269         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1270         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1271 
1272         uint8_t* ppDsts[] =
1273         {
1274             pDst,                                           // row 0, col 0
1275             pDst + pDstSurface->pitch,                      // row 1, col 0
1276             pDst + dx / 2,                                  // row 0, col 1
1277             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1278         };
1279 
1280         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1281         {
1282             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1283             {
1284                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1285 
1286                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1287 
1288                 ppDsts[0] += dx;
1289                 ppDsts[1] += dx;
1290                 ppDsts[2] += dx;
1291                 ppDsts[3] += dx;
1292             }
1293 
1294             ppDsts[0] += dy;
1295             ppDsts[1] += dy;
1296             ppDsts[2] += dy;
1297             ppDsts[3] += dy;
1298         }
1299 #else
1300         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1301 
1302         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1303         {
1304             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1305 
1306             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1307             {
1308                 // Format conversion and convert from SOA to AOS, and store the rows.
1309                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1310 
1311                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1312                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1313                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1314             }
1315 
1316             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1317             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1318         }
1319 #endif
1320     }
1321 };
1322 
1323 //////////////////////////////////////////////////////////////////////////
1324 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1325 //////////////////////////////////////////////////////////////////////////
1326 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1327 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1328 {
1329     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1330     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1331     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1332 
1333     //////////////////////////////////////////////////////////////////////////
1334     /// @brief Stores an 8x8 raster tile to the destination surface.
1335     /// @param pSrc - Pointer to raster tile.
1336     /// @param pDstSurface - Destination surface state
1337     /// @param x, y - Coordinates to raster tile.
1338     INLINE static void Store(
1339         uint8_t *pSrc,
1340         SWR_SURFACE_STATE* pDstSurface,
1341         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1342     {
1343         // Punt non-full tiles to generic store
1344         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1345         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1346 
1347         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1348         {
1349             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1350         }
1351 
1352         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1353             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1354 #if USE_8x2_TILE_BACKEND
1355 
1356         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1357         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1358 
1359         uint8_t* ppDsts[] =
1360         {
1361             pDst,                                           // row 0, col 0
1362             pDst + pDstSurface->pitch,                      // row 1, col 0
1363             pDst + dx / 2,                                  // row 0, col 1
1364             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1365         };
1366 
1367         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1368         {
1369             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1370             {
1371                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1372 
1373                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1374 
1375                 ppDsts[0] += dx;
1376                 ppDsts[1] += dx;
1377                 ppDsts[2] += dx;
1378                 ppDsts[3] += dx;
1379             }
1380 
1381             ppDsts[0] += dy;
1382             ppDsts[1] += dy;
1383             ppDsts[2] += dy;
1384             ppDsts[3] += dy;
1385         }
1386 #else
1387         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1388 
1389         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1390         {
1391             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1392 
1393             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1394             {
1395                 // Format conversion and convert from SOA to AOS, and store the rows.
1396                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1397 
1398                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1399                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1400                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1401             }
1402 
1403             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1404             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1405         }
1406 #endif
1407     }
1408 };
1409 
1410 //////////////////////////////////////////////////////////////////////////
1411 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1412 //////////////////////////////////////////////////////////////////////////
1413 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1414 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1415 {
1416     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1417     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1418     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1419     static const size_t MAX_DST_COLUMN_BYTES = 16;
1420 #if !USE_8x2_TILE_BACKEND
1421     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1422     static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1423 #endif
1424 
1425     //////////////////////////////////////////////////////////////////////////
1426     /// @brief Stores an 8x8 raster tile to the destination surface.
1427     /// @param pSrc - Pointer to raster tile.
1428     /// @param pDstSurface - Destination surface state
1429     /// @param x, y - Coordinates to raster tile.
1430     INLINE static void Store(
1431         uint8_t *pSrc,
1432         SWR_SURFACE_STATE* pDstSurface,
1433         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1434     {
1435         // Punt non-full tiles to generic store
1436         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1437         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1438 
1439         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1440         {
1441             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1442         }
1443 
1444         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1445             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1446 #if USE_8x2_TILE_BACKEND
1447 
1448         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1449         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1450 
1451         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1452         static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1453 
1454         uint8_t *ppDsts[] =
1455         {
1456             pDst,                                                               // row 0, col 0
1457             pDst + pDstSurface->pitch,                                          // row 1, col 0
1458             pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
1459             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
1460             pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
1461             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
1462             pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
1463             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3                // row 1, col 3
1464         };
1465 
1466         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1467         {
1468             // Raster tile width is same as simd16 tile width
1469             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1470 
1471             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1472 
1473             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1474 
1475             for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1476             {
1477                 ppDsts[i] += dy;
1478             }
1479         }
1480 #else
1481         uint8_t* ppDsts[] =
1482         {
1483             pDst,                                               // row 0, col 0
1484             pDst + pDstSurface->pitch,                          // row 1, col 0
1485             pDst + MAX_DST_COLUMN_BYTES,                        // row 0, col 1
1486             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,   // row 1, col 1
1487         };
1488 
1489         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1490         {
1491             uint8_t* ppStartRows[] =
1492             {
1493                 ppDsts[0],
1494                 ppDsts[1],
1495                 ppDsts[2],
1496                 ppDsts[3],
1497             };
1498 
1499             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1500             {
1501                 // Format conversion and convert from SOA to AOS, and store the rows.
1502                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1503 
1504                 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1505                 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1506                 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1507                 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1508                 pSrc += SRC_COLUMN_BYTES;
1509             }
1510 
1511             ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1512             ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1513             ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
1514             ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
1515         }
1516 #endif
1517     }
1518 };
1519 
1520 //////////////////////////////////////////////////////////////////////////
1521 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1522 //////////////////////////////////////////////////////////////////////////
1523 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1524 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1525 {
1526     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1527     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1528     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1529     static const size_t MAX_DST_COLUMN_BYTES = 16;
1530 #if !USE_8x2_TILE_BACKEND
1531     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1532     static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1533 #endif
1534 
1535     //////////////////////////////////////////////////////////////////////////
1536     /// @brief Stores an 8x8 raster tile to the destination surface.
1537     /// @param pSrc - Pointer to raster tile.
1538     /// @param pDstSurface - Destination surface state
1539     /// @param x, y - Coordinates to raster tile.
1540     INLINE static void Store(
1541         uint8_t *pSrc,
1542         SWR_SURFACE_STATE* pDstSurface,
1543         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1544     {
1545         // Punt non-full tiles to generic store
1546         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1547         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1548 
1549         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1550         {
1551             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1552         }
1553 
1554         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1555             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1556 #if USE_8x2_TILE_BACKEND
1557 
1558         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1559         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1560 
1561         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1562         static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1563 
1564         uint8_t* ppDsts[] =
1565         {
1566             pDst,                                                               // row 0, col 0
1567             pDst + pDstSurface->pitch,                                          // row 1, col 0
1568             pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
1569             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
1570             pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
1571             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
1572             pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
1573             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,               // row 1, col 3
1574             pDst + MAX_DST_COLUMN_BYTES * 4,                                    // row 0, col 4
1575             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4,               // row 1, col 4
1576             pDst + MAX_DST_COLUMN_BYTES * 5,                                    // row 0, col 5
1577             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5,               // row 1, col 5
1578             pDst + MAX_DST_COLUMN_BYTES * 6,                                    // row 0, col 6
1579             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6,               // row 1, col 6
1580             pDst + MAX_DST_COLUMN_BYTES * 7,                                    // row 0, col 7
1581             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7,               // row 1, col 7
1582         };
1583 
1584         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1585         {
1586             // Raster tile width is same as simd16 tile width
1587             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1588 
1589             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1590 
1591             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1592 
1593             for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1594             {
1595                 ppDsts[i] += dy;
1596             }
1597         }
1598 #else
1599         struct DstPtrs
1600         {
1601             uint8_t* ppDsts[8];
1602         } ptrs;
1603 
1604         // Need 8 pointers, 4 columns of 2 rows each
1605         for (uint32_t y = 0; y < 2; ++y)
1606         {
1607             for (uint32_t x = 0; x < 4; ++x)
1608             {
1609                 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1610             }
1611         }
1612 
1613         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1614         {
1615             DstPtrs startPtrs = ptrs;
1616 
1617             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1618             {
1619                 // Format conversion and convert from SOA to AOS, and store the rows.
1620                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1621 
1622                 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1623                 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1624                 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1625                 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1626                 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1627                 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1628                 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1629                 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1630                 pSrc += SRC_COLUMN_BYTES;
1631             }
1632 
1633             ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1634             ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1635             ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1636             ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1637             ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1638             ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1639             ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1640             ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1641         }
1642 #endif
1643     }
1644 };
1645 
1646 //////////////////////////////////////////////////////////////////////////
1647 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1648 //////////////////////////////////////////////////////////////////////////
1649 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1650 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1651 {
1652     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1653     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1654 
1655     //////////////////////////////////////////////////////////////////////////
1656     /// @brief Stores an 8x8 raster tile to the destination surface.
1657     /// @param pSrc - Pointer to raster tile.
1658     /// @param pDstSurface - Destination surface state
1659     /// @param x, y - Coordinates to raster tile.
1660     INLINE static void Store(
1661         uint8_t *pSrc,
1662         SWR_SURFACE_STATE* pDstSurface,
1663         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1664     {
1665         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1666 
1667         // Punt non-full tiles to generic store
1668         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1669         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1670 
1671         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1672         {
1673             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1674         }
1675 
1676         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1677         // We can compute the offsets to each column within the raster tile once and increment from these.
1678 #if USE_8x2_TILE_BACKEND
1679         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1680         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1681             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1682 
1683         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1684 
1685         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1686         uint8_t *ppDsts[] =
1687         {
1688             pDst,
1689             pDst + DestRowWidthBytes,
1690             pDst + DestRowWidthBytes / 4,
1691             pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1692         };
1693 
1694         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1695         {
1696             // Raster tile width is same as simd16 tile width
1697             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1698 
1699             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1700 
1701             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1702 
1703             ppDsts[0] += dy;
1704             ppDsts[1] += dy;
1705             ppDsts[2] += dy;
1706             ppDsts[3] += dy;
1707         }
1708 #else
1709         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1710         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1711             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1712 
1713         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1714         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1715 
1716         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1717         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1718         {
1719             uint32_t rowOffset = row * DestRowWidthBytes;
1720 
1721             uint8_t* pRow = pCol0 + rowOffset;
1722             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1723 
1724             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1725             pSrc += pSrcInc;
1726 
1727             ppDsts[0] += DestRowWidthBytes / 4;
1728             ppDsts[1] += DestRowWidthBytes / 4;
1729 
1730             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1731             pSrc += pSrcInc;
1732         }
1733 #endif
1734     }
1735 };
1736 
1737 //////////////////////////////////////////////////////////////////////////
1738 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1739 //////////////////////////////////////////////////////////////////////////
1740 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1741 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1742 {
1743     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1744     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1745 
1746     //////////////////////////////////////////////////////////////////////////
1747     /// @brief Stores an 8x8 raster tile to the destination surface.
1748     /// @param pSrc - Pointer to raster tile.
1749     /// @param pDstSurface - Destination surface state
1750     /// @param x, y - Coordinates to raster tile.
1751     INLINE static void Store(
1752         uint8_t *pSrc,
1753         SWR_SURFACE_STATE* pDstSurface,
1754         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1755     {
1756         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1757 
1758         // Punt non-full tiles to generic store
1759         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1760         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1761 
1762         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1763         {
1764             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1765         }
1766 
1767         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1768         // We can compute the offsets to each column within the raster tile once and increment from these.
1769 #if USE_8x2_TILE_BACKEND
1770         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1771         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1772             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1773 
1774         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1775 
1776         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1777         uint8_t *ppDsts[] =
1778         {
1779             pDst,
1780             pDst + DestRowWidthBytes,
1781             pDst + DestRowWidthBytes / 2,
1782             pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1783         };
1784 
1785         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1786         {
1787             // Raster tile width is same as simd16 tile width
1788             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1789 
1790             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1791 
1792             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1793 
1794             ppDsts[0] += dy;
1795             ppDsts[1] += dy;
1796             ppDsts[2] += dy;
1797             ppDsts[3] += dy;
1798         }
1799 #else
1800         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1801         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1802             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1803 
1804         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1805         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1806 
1807         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1808         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1809         {
1810             uint32_t rowOffset = row * DestRowWidthBytes;
1811 
1812             uint8_t* pRow = pCol0 + rowOffset;
1813             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1814 
1815             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1816             pSrc += pSrcInc;
1817 
1818             ppDsts[0] += DestRowWidthBytes / 2;
1819             ppDsts[1] += DestRowWidthBytes / 2;
1820 
1821             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1822             pSrc += pSrcInc;
1823         }
1824 #endif
1825     }
1826 };
1827 
1828 //////////////////////////////////////////////////////////////////////////
1829 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1830 //////////////////////////////////////////////////////////////////////////
1831 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1832 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1833 {
1834     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1835     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1836     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1837 
1838     //////////////////////////////////////////////////////////////////////////
1839     /// @brief Stores an 8x8 raster tile to the destination surface.
1840     /// @param pSrc - Pointer to raster tile.
1841     /// @param pDstSurface - Destination surface state
1842     /// @param x, y - Coordinates to raster tile.
1843     INLINE static void Store(
1844         uint8_t *pSrc,
1845         SWR_SURFACE_STATE* pDstSurface,
1846         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1847     {
1848         static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
1849 
1850         // Punt non-full tiles to generic store
1851         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1852         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1853 
1854         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1855         {
1856             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1857         }
1858 
1859         // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1860         // We can compute the offsets to each column within the raster tile once and increment from these.
1861 #if USE_8x2_TILE_BACKEND
1862         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1863             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1864 
1865         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1866         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1867 
1868         uint8_t* ppDsts[] =
1869         {
1870             pDst,                                           // row 0, col 0
1871             pDst + DestRowWidthBytes,                       // row 1, col 0
1872             pDst + dx / 2,                                  // row 0, col 1
1873             pDst + DestRowWidthBytes + dx / 2               // row 1, col 1
1874         };
1875 
1876         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1877         {
1878             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1879             {
1880                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1881 
1882                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1883 
1884                 ppDsts[0] += dx;
1885                 ppDsts[1] += dx;
1886                 ppDsts[2] += dx;
1887                 ppDsts[3] += dx;
1888             }
1889 
1890             ppDsts[0] += dy;
1891             ppDsts[1] += dy;
1892             ppDsts[2] += dy;
1893             ppDsts[3] += dy;
1894         }
1895 #else
1896         uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1897             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1898         uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1899 
1900         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1901         {
1902             for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1903             {
1904                 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1905 
1906                 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1907                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1908 
1909                 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1910                 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1911             }
1912 
1913             pRow0 += (DestRowWidthBytes * 2);
1914             pRow1 += (DestRowWidthBytes * 2);
1915         }
1916 #endif
1917     }
1918 };
1919 
1920 //////////////////////////////////////////////////////////////////////////
1921 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1922 //////////////////////////////////////////////////////////////////////////
1923 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1924 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1925 {
1926     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1927     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1928 
1929     //////////////////////////////////////////////////////////////////////////
1930     /// @brief Stores an 8x8 raster tile to the destination surface.
1931     /// @param pSrc - Pointer to raster tile.
1932     /// @param pDstSurface - Destination surface state
1933     /// @param x, y - Coordinates to raster tile.
1934     INLINE static void Store(
1935         uint8_t *pSrc,
1936         SWR_SURFACE_STATE* pDstSurface,
1937         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1938     {
1939         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1940         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
1941 
1942         // Punt non-full tiles to generic store
1943         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1944         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1945 
1946         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1947         {
1948             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1949         }
1950 
1951         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1952         // We can compute the offsets to each column within the raster tile once and increment from these.
1953 #if USE_8x2_TILE_BACKEND
1954         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1955         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1956             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1957 
1958         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1959         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1960 
1961         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1962         uint8_t *ppDsts[] =
1963         {
1964             pDst,                                           // row 0, col 0
1965             pDst + DestRowWidthBytes,                       // row 1, col 0
1966             pDst + DestColumnBytes,                         // row 0, col 1
1967             pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
1968         };
1969 
1970         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1971         {
1972             // Raster tile width is same as simd16 tile width
1973             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1974 
1975             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1976 
1977             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1978 
1979             ppDsts[0] += dy;
1980             ppDsts[1] += dy;
1981             ppDsts[2] += dy;
1982             ppDsts[3] += dy;
1983         }
1984 #else
1985         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1986         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1987             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1988 
1989         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1990         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1991 
1992         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1993         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1994         {
1995             uint32_t rowOffset = row * DestRowWidthBytes;
1996 
1997             uint8_t* pRow = pCol0 + rowOffset;
1998             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1999 
2000             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2001             pSrc += pSrcInc;
2002 
2003             ppDsts[0] += DestColumnBytes;
2004             ppDsts[1] += DestColumnBytes;
2005 
2006             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2007             pSrc += pSrcInc;
2008         }
2009 #endif
2010     }
2011 };
2012 
2013 //////////////////////////////////////////////////////////////////////////
2014 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2015 //////////////////////////////////////////////////////////////////////////
2016 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2017 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
2018 {
2019     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2020     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2021 
2022     //////////////////////////////////////////////////////////////////////////
2023     /// @brief Stores an 8x8 raster tile to the destination surface.
2024     /// @param pSrc - Pointer to raster tile.
2025     /// @param pDstSurface - Destination surface state
2026     /// @param x, y - Coordinates to raster tile.
2027     INLINE static void Store(
2028         uint8_t *pSrc,
2029         SWR_SURFACE_STATE* pDstSurface,
2030         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2031     {
2032         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
2033         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
2034 
2035         // Punt non-full tiles to generic store
2036         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2037         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2038 
2039         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2040         {
2041             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2042         }
2043 
2044         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2045         // We can compute the offsets to each column within the raster tile once and increment from these.
2046 #if USE_8x2_TILE_BACKEND
2047         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2048         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2049             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2050 
2051         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2052         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2053 
2054         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2055         uint8_t *ppDsts[] =
2056         {
2057             pDst,                                           // row 0, col 0
2058             pDst + DestRowWidthBytes,                       // row 1, col 0
2059             pDst + DestColumnBytes,                         // row 0, col 1
2060             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
2061             pDst + DestColumnBytes * 2,                     // row 0, col 2
2062             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2063             pDst + DestColumnBytes * 3,                     // row 0, col 3
2064             pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
2065         };
2066 
2067         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2068         {
2069             // Raster tile width is same as simd16 tile width
2070             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2071 
2072             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2073 
2074             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2075 
2076             for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2077             {
2078                 ppDsts[i] += dy;
2079             }
2080         }
2081 #else
2082         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2083         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2084             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2085         uint8_t* pCol1 = pCol0 + DestColumnBytes;
2086 
2087         // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2088         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2089         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2090 
2091         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2092         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2093         {
2094             uint32_t rowOffset = row * DestRowWidthBytes;
2095             uint8_t* ppDsts[] =
2096             {
2097                 pCol0 + rowOffset,
2098                 pCol0 + rowOffset + DestRowWidthBytes,
2099                 pCol1 + rowOffset,
2100                 pCol1 + rowOffset + DestRowWidthBytes,
2101             };
2102 
2103             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2104             pSrc += pSrcInc;
2105 
2106             ppDsts[0] += DestColumnBytes * 2;
2107             ppDsts[1] += DestColumnBytes * 2;
2108             ppDsts[2] += DestColumnBytes * 2;
2109             ppDsts[3] += DestColumnBytes * 2;
2110 
2111             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2112             pSrc += pSrcInc;
2113         }
2114 #endif
2115     }
2116 };
2117 
2118 //////////////////////////////////////////////////////////////////////////
2119 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2120 //////////////////////////////////////////////////////////////////////////
2121 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2122 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
2123 {
2124     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2125 #if USE_8x2_TILE_BACKEND
2126     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2127 
2128 #else
2129     static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2130     static const size_t TILE_Y_ROWS = 32;
2131     static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2132 
2133     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2134     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2135     static const size_t MAX_DST_COLUMN_BYTES = 16;
2136 
2137     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
2138     static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
2139 
2140 #endif
2141     //////////////////////////////////////////////////////////////////////////
2142     /// @brief Stores an 8x8 raster tile to the destination surface.
2143     /// @param pSrc - Pointer to raster tile.
2144     /// @param pDstSurface - Destination surface state
2145     /// @param x, y - Coordinates to raster tile.
2146     INLINE static void Store(
2147         uint8_t *pSrc,
2148         SWR_SURFACE_STATE* pDstSurface,
2149         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2150     {
2151 #if USE_8x2_TILE_BACKEND
2152         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
2153         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
2154 #endif
2155 
2156         // Punt non-full tiles to generic store
2157         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2158         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2159 
2160         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2161         {
2162             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2163         }
2164 
2165         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2166         // We can compute the offsets to each column within the raster tile once and increment from these.
2167 #if USE_8x2_TILE_BACKEND
2168         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2169         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2170             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2171 
2172         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2173         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2174 
2175         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2176         uint8_t *ppDsts[] =
2177         {
2178             pDst,                                           // row 0, col 0
2179             pDst + DestRowWidthBytes,                       // row 1, col 0
2180             pDst + DestColumnBytes,                         // row 0, col 1
2181             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
2182             pDst + DestColumnBytes * 2,                     // row 0, col 2
2183             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2184             pDst + DestColumnBytes * 3,                     // row 0, col 3
2185             pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
2186             pDst + DestColumnBytes * 4,                     // row 0, col 4
2187             pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
2188             pDst + DestColumnBytes * 5,                     // row 0, col 5
2189             pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
2190             pDst + DestColumnBytes * 6,                     // row 0, col 6
2191             pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
2192             pDst + DestColumnBytes * 7,                     // row 0, col 7
2193             pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
2194         };
2195 
2196         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2197         {
2198             // Raster tile width is same as simd16 tile width
2199             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2200 
2201             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2202 
2203             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2204 
2205             for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2206             {
2207                 ppDsts[i] += dy;
2208             }
2209         }
2210 #else
2211         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2212         uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2213             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2214         struct DstPtrs
2215         {
2216             uint8_t* ppDsts[8];
2217         } ptrs;
2218 
2219         // Need 8 pointers, 4 columns of 2 rows each
2220         for (uint32_t y = 0; y < 2; ++y)
2221         {
2222             for (uint32_t x = 0; x < 4; ++x)
2223             {
2224                 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
2225             }
2226         }
2227 
2228         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
2229         {
2230             DstPtrs startPtrs = ptrs;
2231 
2232             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
2233             {
2234                 // Format conversion and convert from SOA to AOS, and store the rows.
2235                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
2236 
2237                 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
2238                 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
2239                 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
2240                 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
2241                 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
2242                 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
2243                 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
2244                 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
2245                 pSrc += SRC_COLUMN_BYTES;
2246             }
2247 
2248             ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
2249             ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
2250             ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
2251             ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
2252             ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
2253             ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
2254             ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
2255             ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
2256         }
2257 #endif
2258     }
2259 };
2260 
2261 //////////////////////////////////////////////////////////////////////////
2262 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2263 //////////////////////////////////////////////////////////////////////////
2264 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2265 struct StoreMacroTile
2266 {
2267     //////////////////////////////////////////////////////////////////////////
2268     /// @brief Stores a macrotile to the destination surface using safe implementation.
2269     /// @param pSrc - Pointer to macro tile.
2270     /// @param pDstSurface - Destination surface state
2271     /// @param x, y - Coordinates to macro tile
2272     static void StoreGeneric(
2273         uint8_t *pSrcHotTile,
2274         SWR_SURFACE_STATE* pDstSurface,
2275         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2276     {
2277         PFN_STORE_TILES_INTERNAL pfnStore;
2278         pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2279 
2280         // Store each raster tile from the hot tile to the destination surface.
2281         for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2282         {
2283             for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2284             {
2285                 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2286                 {
2287                     pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2288                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2289                 }
2290             }
2291         }
2292 
2293     }
2294 
2295     typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
2296     //////////////////////////////////////////////////////////////////////////
2297     /// @brief Stores a macrotile to the destination surface.
2298     /// @param pSrc - Pointer to macro tile.
2299     /// @param pDstSurface - Destination surface state
2300     /// @param x, y - Coordinates to macro tile
2301     static void Store(
2302         uint8_t *pSrcHotTile,
2303         SWR_SURFACE_STATE* pDstSurface,
2304         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2305     {
2306         PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
2307 
2308         for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2309         {
2310             size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
2311                 0,
2312                 0,
2313                 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
2314                 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
2315                 sampleNum,
2316                 pDstSurface->lod,
2317                 pDstSurface);
2318 
2319             // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2320             bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
2321                 (pDstSurface->bInterleavedSamples);
2322 
2323             pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2324         }
2325 
2326         // Store each raster tile from the hot tile to the destination surface.
2327         for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2328         {
2329             for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2330             {
2331                 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2332                 {
2333                     pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2334                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2335                 }
2336             }
2337         }
2338     }
2339 };
2340 
2341 //////////////////////////////////////////////////////////////////////////
2342 /// InitStoreTilesTable - Helper for setting up the tables.
2343 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2344 void InitStoreTilesTableColor_Half1(
2345     PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
2346 {
2347     table[TTileMode][R32G32B32A32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
2348     table[TTileMode][R32G32B32A32_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
2349     table[TTileMode][R32G32B32A32_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
2350     table[TTileMode][R32G32B32X32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
2351     table[TTileMode][R32G32B32A32_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
2352     table[TTileMode][R32G32B32A32_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
2353     table[TTileMode][R32G32B32_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
2354     table[TTileMode][R32G32B32_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
2355     table[TTileMode][R32G32B32_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
2356     table[TTileMode][R32G32B32_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
2357     table[TTileMode][R32G32B32_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
2358     table[TTileMode][R16G16B16A16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
2359     table[TTileMode][R16G16B16A16_SNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
2360     table[TTileMode][R16G16B16A16_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
2361     table[TTileMode][R16G16B16A16_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
2362     table[TTileMode][R16G16B16A16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
2363     table[TTileMode][R32G32_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
2364     table[TTileMode][R32G32_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
2365     table[TTileMode][R32G32_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
2366     table[TTileMode][R32_FLOAT_X8X24_TYPELESS]      = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2367     table[TTileMode][X32_TYPELESS_G8X24_UINT]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
2368     table[TTileMode][R16G16B16X16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
2369     table[TTileMode][R16G16B16X16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
2370     table[TTileMode][R16G16B16A16_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
2371     table[TTileMode][R16G16B16A16_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
2372     table[TTileMode][R32G32_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
2373     table[TTileMode][R32G32_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
2374     table[TTileMode][B8G8R8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
2375     table[TTileMode][B8G8R8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
2376     table[TTileMode][R10G10B10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
2377     table[TTileMode][R10G10B10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
2378     table[TTileMode][R10G10B10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
2379     table[TTileMode][R8G8B8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
2380     table[TTileMode][R8G8B8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
2381     table[TTileMode][R8G8B8A8_SNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
2382     table[TTileMode][R8G8B8A8_SINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
2383     table[TTileMode][R8G8B8A8_UINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
2384     table[TTileMode][R16G16_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
2385     table[TTileMode][R16G16_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
2386     table[TTileMode][R16G16_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
2387     table[TTileMode][R16G16_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
2388     table[TTileMode][R16G16_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
2389     table[TTileMode][B10G10R10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
2390     table[TTileMode][B10G10R10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
2391     table[TTileMode][R11G11B10_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
2392     table[TTileMode][R10G10B10_FLOAT_A2_UNORM]      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
2393     table[TTileMode][R32_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
2394     table[TTileMode][R32_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
2395     table[TTileMode][R32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
2396     table[TTileMode][R24_UNORM_X8_TYPELESS]         = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
2397     table[TTileMode][X24_TYPELESS_G8_UINT]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
2398     table[TTileMode][A32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
2399     table[TTileMode][B8G8R8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
2400     table[TTileMode][B8G8R8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
2401     table[TTileMode][R8G8B8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
2402     table[TTileMode][R8G8B8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
2403 }
2404 
2405 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2406 void InitStoreTilesTableColor_Half2(
2407     PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
2408 {
2409     table[TTileMode][R9G9B9E5_SHAREDEXP]            = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
2410     table[TTileMode][B10G10R10X2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
2411     table[TTileMode][R10G10B10X2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
2412     table[TTileMode][R8G8B8A8_SSCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
2413     table[TTileMode][R8G8B8A8_USCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
2414     table[TTileMode][R16G16_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
2415     table[TTileMode][R16G16_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
2416     table[TTileMode][R32_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
2417     table[TTileMode][R32_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
2418     table[TTileMode][B5G6R5_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
2419     table[TTileMode][B5G6R5_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
2420     table[TTileMode][B5G5R5A1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
2421     table[TTileMode][B5G5R5A1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
2422     table[TTileMode][B4G4R4A4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
2423     table[TTileMode][B4G4R4A4_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
2424     table[TTileMode][R8G8_UNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
2425     table[TTileMode][R8G8_SNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
2426     table[TTileMode][R8G8_SINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
2427     table[TTileMode][R8G8_UINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
2428     table[TTileMode][R16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
2429     table[TTileMode][R16_SNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
2430     table[TTileMode][R16_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
2431     table[TTileMode][R16_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
2432     table[TTileMode][R16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
2433     table[TTileMode][A16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
2434     table[TTileMode][A16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
2435     table[TTileMode][B5G5R5X1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
2436     table[TTileMode][B5G5R5X1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
2437     table[TTileMode][R8G8_SSCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
2438     table[TTileMode][R8G8_USCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
2439     table[TTileMode][R16_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
2440     table[TTileMode][R16_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
2441     table[TTileMode][A1B5G5R5_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
2442     table[TTileMode][A4B4G4R4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
2443     table[TTileMode][R8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
2444     table[TTileMode][R8_SNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
2445     table[TTileMode][R8_SINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
2446     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
2447     table[TTileMode][A8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
2448     table[TTileMode][R8_SSCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
2449     table[TTileMode][R8_USCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
2450     table[TTileMode][R8G8B8_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
2451     table[TTileMode][R8G8B8_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
2452     table[TTileMode][R8G8B8_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
2453     table[TTileMode][R8G8B8_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
2454     table[TTileMode][R16G16B16_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
2455     table[TTileMode][R16G16B16_UNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
2456     table[TTileMode][R16G16B16_SNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
2457     table[TTileMode][R16G16B16_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2458     table[TTileMode][R16G16B16_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2459     table[TTileMode][R8G8B8_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2460     table[TTileMode][R16G16B16_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2461     table[TTileMode][R16G16B16_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2462     table[TTileMode][R10G10B10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2463     table[TTileMode][R10G10B10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2464     table[TTileMode][R10G10B10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2465     table[TTileMode][R10G10B10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2466     table[TTileMode][B10G10R10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2467     table[TTileMode][B10G10R10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2468     table[TTileMode][B10G10R10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2469     table[TTileMode][B10G10R10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2470     table[TTileMode][B10G10R10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2471     table[TTileMode][R8G8B8_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2472     table[TTileMode][R8G8B8_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2473 }
2474 
2475 //////////////////////////////////////////////////////////////////////////
2476 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2477 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2478 void InitStoreTilesTableDepth(
2479     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2480 {
2481    table[TTileMode][R32_FLOAT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2482    table[TTileMode][R32_FLOAT_X8X24_TYPELESS]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2483    table[TTileMode][R24_UNORM_X8_TYPELESS]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2484    table[TTileMode][R16_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2485 }
2486 
2487 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2488 void InitStoreTilesTableStencil(
2489     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2490 {
2491     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2492 }
2493