• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35 
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
39 
40 #include <array>
41 #include <sstream>
42 
43 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
44 
45 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
46 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
47 
48 //////////////////////////////////////////////////////////////////////////
49 /// Store Raster Tile Function Tables.
50 //////////////////////////////////////////////////////////////////////////
51 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
53 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
54 
55 void InitStoreTilesTable_Linear_1();
56 void InitStoreTilesTable_Linear_2();
57 void InitStoreTilesTable_TileX_1();
58 void InitStoreTilesTable_TileX_2();
59 void InitStoreTilesTable_TileY_1();
60 void InitStoreTilesTable_TileY_2();
61 void InitStoreTilesTable_TileW();
62 void InitStoreTilesTable();
63 
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for storing one SIMD raster-tile to its
///        destination rows.  The generic Store is deleted, so only the
///        explicit (PixelSize, NumDests) specializations can be called;
///        any other combination fails at compile time.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of destination pointers.  Each pointer is
///                   to a single row of at most 16B.
/// @tparam PixelSize - Size of one pixel in bits.
/// @tparam NumDests  - Number of destination pointers.  Each pair of
///                     pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    // Intentionally unavailable: see the specializations.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
78 
79 //////////////////////////////////////////////////////////////////////////
80 /// StorePixels (32-bit pixel specialization)
81 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
82 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
83 /// @param ppDsts   - Array of destination pointers.  Each pointer is
84 ///                   to a single row of at most 16B.
85 /// @tparam NumDests - Number of destination pointers.  Each pair of
86 ///                    pointers is for a 16-byte column of two rows.
87 //////////////////////////////////////////////////////////////////////////
88 template <>
89 struct StorePixels<8, 2>
90 {
91     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
92     {
93         // Each 4-pixel row is 4 bytes.
94         const uint16_t* pPixSrc = (const uint16_t*)pSrc;
95 
96         // Unswizzle from SWR-Z order
97         uint16_t* pRow = (uint16_t*)ppDsts[0];
98         pRow[0] = pPixSrc[0];
99         pRow[1] = pPixSrc[2];
100 
101         pRow = (uint16_t*)ppDsts[1];
102         pRow[0] = pPixSrc[1];
103         pRow[1] = pPixSrc[3];
104     }
105 };
106 
107 #if USE_8x2_TILE_BACKEND
108 template <>
109 struct StorePixels<8, 4>
110 {
111     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
112     {
113         // 8 x 2 bytes = 16 bytes, 16 pixels
114         const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
115 
116         uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
117 
118         // Unswizzle from SWR-Z order
119         ppDsts16[0][0] = pSrc16[0];     // 0 1
120         ppDsts16[0][1] = pSrc16[2];     // 4 5
121 
122         ppDsts16[1][0] = pSrc16[1];     // 2 3
123         ppDsts16[1][1] = pSrc16[3];     // 6 7
124 
125         ppDsts16[2][0] = pSrc16[4];     // 8 9
126         ppDsts16[2][1] = pSrc16[6];     // C D
127 
128         ppDsts16[3][0] = pSrc16[5];     // A B
129         ppDsts16[3][1] = pSrc16[7];     // E F
130     }
131 };
132 
133 #endif
134 //////////////////////////////////////////////////////////////////////////
135 /// StorePixels (32-bit pixel specialization)
136 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
137 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
138 /// @param ppDsts   - Array of destination pointers.  Each pointer is
139 ///                   to a single row of at most 16B.
140 /// @tparam NumDests - Number of destination pointers.  Each pair of
141 ///                    pointers is for a 16-byte column of two rows.
142 //////////////////////////////////////////////////////////////////////////
143 template <>
144 struct StorePixels<16, 2>
145 {
146     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
147     {
148         // Each 4-pixel row is 8 bytes.
149         const uint32_t* pPixSrc = (const uint32_t*)pSrc;
150 
151         // Unswizzle from SWR-Z order
152         uint32_t* pRow = (uint32_t*)ppDsts[0];
153         pRow[0] = pPixSrc[0];
154         pRow[1] = pPixSrc[2];
155 
156         pRow = (uint32_t*)ppDsts[1];
157         pRow[0] = pPixSrc[1];
158         pRow[1] = pPixSrc[3];
159     }
160 };
161 
162 #if USE_8x2_TILE_BACKEND
163 template <>
164 struct StorePixels<16, 4>
165 {
166     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
167     {
168         // 8 x 4 bytes = 32 bytes, 16 pixels
169         const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
170 
171         uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
172 
173         // Unswizzle from SWR-Z order
174         ppDsts32[0][0] = pSrc32[0];     // 0 1
175         ppDsts32[0][1] = pSrc32[2];     // 4 5
176 
177         ppDsts32[1][0] = pSrc32[1];     // 2 3
178         ppDsts32[1][1] = pSrc32[3];     // 6 7
179 
180         ppDsts32[2][0] = pSrc32[4];     // 8 9
181         ppDsts32[2][1] = pSrc32[6];     // C D
182 
183         ppDsts32[3][0] = pSrc32[5];     // A B
184         ppDsts32[3][1] = pSrc32[7];     // E F
185     }
186 };
187 
188 #endif
189 //////////////////////////////////////////////////////////////////////////
190 /// StorePixels (32-bit pixel specialization)
191 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
192 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
193 /// @param ppDsts   - Array of destination pointers.  Each pointer is
194 ///                   to a single row of at most 16B.
195 /// @tparam NumDests - Number of destination pointers.  Each pair of
196 ///                    pointers is for a 16-byte column of two rows.
197 //////////////////////////////////////////////////////////////////////////
198 template <>
199 struct StorePixels<32, 2>
200 {
201     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
202     {
203         // Each 4-pixel row is 16-bytes
204         simd4scalari *pZRow01 = (simd4scalari*)pSrc;
205         simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
206         simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
207 
208         simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
209         simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
210 
211         SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
212         SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
213     }
214 };
215 
216 #if USE_8x2_TILE_BACKEND
217 template <>
218 struct StorePixels<32, 4>
219 {
220     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
221     {
222         // 4 x 16 bytes = 64 bytes, 16 pixels
223         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
224 
225         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
226 
227         // Unswizzle from SWR-Z order
228         simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]);                        // 0 1 2 3
229         simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]);                        // 4 5 6 7
230         simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]);                        // 8 9 A B
231         simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]);                        // C D E F
232 
233         SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1));   // 0 1 4 5
234         SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1));   // 2 3 6 7
235         SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3));   // 8 9 C D
236         SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3));   // A B E F
237     }
238 };
239 
240 #endif
241 //////////////////////////////////////////////////////////////////////////
242 /// StorePixels (32-bit pixel specialization)
243 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
244 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
245 /// @param ppDsts   - Array of destination pointers.  Each pointer is
246 ///                   to a single row of at most 16B.
247 /// @tparam NumDests - Number of destination pointers.  Each pair of
248 ///                    pointers is for a 16-byte column of two rows.
249 //////////////////////////////////////////////////////////////////////////
250 template <>
251 struct StorePixels<64, 4>
252 {
253     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
254     {
255         // Each 4-pixel row is 32 bytes.
256         const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
257 
258         // order of pointers match SWR-Z layout
259         simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
260         *pvDsts[0] = pPixSrc[0];
261         *pvDsts[1] = pPixSrc[1];
262         *pvDsts[2] = pPixSrc[2];
263         *pvDsts[3] = pPixSrc[3];
264     }
265 };
266 
267 #if USE_8x2_TILE_BACKEND
268 template <>
269 struct StorePixels<64, 8>
270 {
271     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
272     {
273         // 8 x 16 bytes = 128 bytes, 16 pixels
274         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
275 
276         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
277 
278         // order of pointers match SWR-Z layout
279         *ppDsts128[0] = pSrc128[0];     // 0 1
280         *ppDsts128[1] = pSrc128[1];     // 2 3
281         *ppDsts128[2] = pSrc128[2];     // 4 5
282         *ppDsts128[3] = pSrc128[3];     // 6 7
283         *ppDsts128[4] = pSrc128[4];     // 8 9
284         *ppDsts128[5] = pSrc128[5];     // A B
285         *ppDsts128[6] = pSrc128[6];     // C D
286         *ppDsts128[7] = pSrc128[7];     // E F
287     }
288 };
289 
290 #endif
291 //////////////////////////////////////////////////////////////////////////
292 /// StorePixels (32-bit pixel specialization)
293 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
294 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
295 /// @param ppDsts   - Array of destination pointers.  Each pointer is
296 ///                   to a single row of at most 16B.
297 /// @tparam NumDests - Number of destination pointers.  Each pair of
298 ///                    pointers is for a 16-byte column of two rows.
299 //////////////////////////////////////////////////////////////////////////
300 template <>
301 struct StorePixels<128, 8>
302 {
303     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
304     {
305         // Each 4-pixel row is 64 bytes.
306         const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
307 
308         // Unswizzle from SWR-Z order
309         simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
310         *pvDsts[0] = pPixSrc[0];
311         *pvDsts[1] = pPixSrc[2];
312         *pvDsts[2] = pPixSrc[1];
313         *pvDsts[3] = pPixSrc[3];
314         *pvDsts[4] = pPixSrc[4];
315         *pvDsts[5] = pPixSrc[6];
316         *pvDsts[6] = pPixSrc[5];
317         *pvDsts[7] = pPixSrc[7];
318     }
319 };
320 
321 #if USE_8x2_TILE_BACKEND
322 template <>
323 struct StorePixels<128, 16>
324 {
325     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
326     {
327         // 16 x 16 bytes = 256 bytes, 16 pixels
328         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
329 
330         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
331 
332         for (uint32_t i = 0; i < 16; i += 4)
333         {
334             *ppDsts128[i + 0] = pSrc128[i + 0];
335             *ppDsts128[i + 1] = pSrc128[i + 2];
336             *ppDsts128[i + 2] = pSrc128[i + 1];
337             *ppDsts128[i + 3] = pSrc128[i + 3];
338         }
339     }
340 };
341 
342 #endif
343 //////////////////////////////////////////////////////////////////////////
344 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
345 //////////////////////////////////////////////////////////////////////////
346 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
347 struct ConvertPixelsSOAtoAOS
348 {
349     //////////////////////////////////////////////////////////////////////////
350     /// @brief Converts a SIMD from the Hot Tile to the destination format
351     ///        and converts from SOA to AOS.
352     /// @param pSrc - Pointer to raster tile.
353     /// @param pDst - Pointer to destination surface or deswizzling buffer.
354     template <size_t NumDests>
355     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
356     {
357 #if USE_8x2_TILE_BACKEND
358         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
359 
360         OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
361         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
362 
363         // Convert from SrcFormat --> DstFormat
364         simd16vector src;
365         LoadSOA<SrcFormat>(pSrc, src);
366         StoreSOA<DstFormat>(src, soaTile);
367 
368         // Convert from SOA --> AOS
369         FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
370 
371 #else
372         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
373 
374         OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
375         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
376 
377         // Convert from SrcFormat --> DstFormat
378         simdvector src;
379         LoadSOA<SrcFormat>(pSrc, src);
380         StoreSOA<DstFormat>(src, soaTile);
381 
382         // Convert from SOA --> AOS
383         FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
384 
385 #endif
386         // Store data into destination
387         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
388     }
389 };
390 
391 //////////////////////////////////////////////////////////////////////////
392 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
393 /// Specialization for no format conversion
394 //////////////////////////////////////////////////////////////////////////
395 template<SWR_FORMAT Format>
396 struct ConvertPixelsSOAtoAOS<Format, Format>
397 {
398     //////////////////////////////////////////////////////////////////////////
399     /// @brief Converts a SIMD from the Hot Tile to the destination format
400     ///        and converts from SOA to AOS.
401     /// @param pSrc - Pointer to raster tile.
402     /// @param pDst - Pointer to destination surface or deswizzling buffer.
403     template <size_t NumDests>
404     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
405     {
406 #if USE_8x2_TILE_BACKEND
407         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
408 
409         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
410 
411         // Convert from SOA --> AOS
412         FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
413 
414 #else
415         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
416 
417         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
418 
419         // Convert from SOA --> AOS
420         FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
421 
422 #endif
423         // Store data into destination
424         StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
425     }
426 };
427 
428 //////////////////////////////////////////////////////////////////////////
429 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
430 //////////////////////////////////////////////////////////////////////////
431 template<>
432 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
433 {
434     //////////////////////////////////////////////////////////////////////////
435     /// @brief Converts a SIMD from the Hot Tile to the destination format
436     ///        and converts from SOA to AOS.
437     /// @param pSrc - Pointer to raster tile.
438     /// @param pDst - Pointer to destination surface or deswizzling buffer.
439     template <size_t NumDests>
440     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
441     {
442 #if USE_8x2_TILE_BACKEND
443         static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
444         static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
445 
446         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
447 
448         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
449 
450         // Load hot-tile
451         simd16vector src, dst;
452         LoadSOA<SrcFormat>(pSrc, src);
453 
454         // deswizzle
455         dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
456         dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
457         dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
458 
459         // clamp
460         dst.x = Clamp<DstFormat>(dst.x, 0);
461         dst.y = Clamp<DstFormat>(dst.y, 1);
462         dst.z = Clamp<DstFormat>(dst.z, 2);
463 
464         // normalize
465         dst.x = Normalize<DstFormat>(dst.x, 0);
466         dst.y = Normalize<DstFormat>(dst.y, 1);
467         dst.z = Normalize<DstFormat>(dst.z, 2);
468 
469         // pack
470         simd16scalari packed = _simd16_castps_si(dst.x);
471 
472         SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
473         SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
474 
475         packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
476         packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
477 
478         // pack low 16 bits of each 32 bit lane to low 128 bits of dst
479         uint32_t *pPacked = (uint32_t*)&packed;
480         uint16_t *pAosTile = (uint16_t*)&aosTile[0];
481         for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
482         {
483             *pAosTile++ = *pPacked++;
484         }
485 
486 #else
487         static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
488         static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
489         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
490 
491         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
492 
493         // Load hot-tile
494         simdvector src, dst;
495         LoadSOA<SrcFormat>(pSrc, src);
496 
497         // deswizzle
498         dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
499         dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
500         dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
501 
502         // clamp
503         dst.x = Clamp<DstFormat>(dst.x, 0);
504         dst.y = Clamp<DstFormat>(dst.y, 1);
505         dst.z = Clamp<DstFormat>(dst.z, 2);
506 
507         // normalize
508         dst.x = Normalize<DstFormat>(dst.x, 0);
509         dst.y = Normalize<DstFormat>(dst.y, 1);
510         dst.z = Normalize<DstFormat>(dst.z, 2);
511 
512         // pack
513         simdscalari packed = _simd_castps_si(dst.x);
514         packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
515         packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
516                                                                               FormatTraits<DstFormat>::GetConstBPC(1)));
517 
518         // pack low 16 bits of each 32 bit lane to low 128 bits of dst
519         uint32_t *pPacked = (uint32_t*)&packed;
520         uint16_t *pAosTile = (uint16_t*)&aosTile[0];
521         for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
522         {
523             *pAosTile++ = *pPacked++;
524         }
525 
526 #endif
527         // Store data into destination
528         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
529     }
530 };
531 
532 //////////////////////////////////////////////////////////////////////////
533 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
534 //////////////////////////////////////////////////////////////////////////
535 template<>
536 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
537 {
538     static const SWR_FORMAT SrcFormat = R32_FLOAT;
539     static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
540 
541     //////////////////////////////////////////////////////////////////////////
542     /// @brief Converts a SIMD from the Hot Tile to the destination format
543     ///        and converts from SOA to AOS.
544     /// @param pSrc - Pointer to raster tile.
545     /// @param pDst - Pointer to destination surface or deswizzling buffer.
546     template <size_t NumDests>
547     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
548     {
549 #if USE_8x2_TILE_BACKEND
550         simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
551 
552         // clamp
553         const simd16scalar zero = _simd16_setzero_ps();
554         const simd16scalar ones = _simd16_set1_ps(1.0f);
555 
556         comp = _simd16_max_ps(comp, zero);
557         comp = _simd16_min_ps(comp, ones);
558 
559         // normalize
560         comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
561 
562         simd16scalari temp = _simd16_cvtps_epi32(comp);
563 
564         // swizzle
565         temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
566 
567         // merge/store data into destination but don't overwrite the X8 bits
568         simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
569         simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
570 
571         simd16scalari dest = _simd16_setzero_si();
572 
573         dest = _simd16_insert_si(dest, destlo, 0);
574         dest = _simd16_insert_si(dest, desthi, 1);
575 
576         simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
577 
578         dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
579 
580         _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
581         _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
582 #else
583         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
584 
585         OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
586         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
587 
588         // Convert from SrcFormat --> DstFormat
589         simdvector src;
590         LoadSOA<SrcFormat>(pSrc, src);
591         StoreSOA<DstFormat>(src, soaTile);
592 
593         // Convert from SOA --> AOS
594         FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
595 
596         // Store data into destination but don't overwrite the X8 bits
597         // Each 4-pixel row is 16-bytes
598         simd4scalari *pZRow01 = (simd4scalari*)aosTile;
599         simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
600         simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
601 
602         simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
603         simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
604 
605         simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
606         simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
607 
608         simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
609 
610         vDst0 = SIMD128::andnot_si(vMask, vDst0);
611         vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
612         vDst1 = SIMD128::andnot_si(vMask, vDst1);
613         vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
614 
615         SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
616         SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
617 #endif
618     }
619 };
620 
621 #if USE_8x2_TILE_BACKEND
622 template<SWR_FORMAT DstFormat>
623 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
624 {
625     // swizzle rgba -> bgra while we load
626     simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
627     simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
628     simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
629     simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
630 
631     // clamp
632     const simd16scalar zero = _simd16_setzero_ps();
633     const simd16scalar ones = _simd16_set1_ps(1.0f);
634 
635     comp0 = _simd16_max_ps(comp0, zero);
636     comp0 = _simd16_min_ps(comp0, ones);
637 
638     comp1 = _simd16_max_ps(comp1, zero);
639     comp1 = _simd16_min_ps(comp1, ones);
640 
641     comp2 = _simd16_max_ps(comp2, zero);
642     comp2 = _simd16_min_ps(comp2, ones);
643 
644     comp3 = _simd16_max_ps(comp3, zero);
645     comp3 = _simd16_min_ps(comp3, ones);
646 
647     // gamma-correct only rgb
648     if (FormatTraits<DstFormat>::isSRGB)
649     {
650         comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
651         comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
652         comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
653     }
654 
655     // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
656     comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
657     comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
658     comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
659     comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
660 
661     // moving to 16 wide integer vector types
662     simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
663     simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
664     simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
665     simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
666 
667     // SOA to AOS conversion
668     src1 = _simd16_slli_epi32(src1,  8);
669     src2 = _simd16_slli_epi32(src2, 16);
670     src3 = _simd16_slli_epi32(src3, 24);
671 
672     simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
673 
674     // de-swizzle conversion
675 #if 1
676     simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
677     simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
678 
679     final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
680 
681 #else
682     final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
683 
684 #endif
685     // store 8x2 memory order:
686     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
687     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
688     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
689     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
690 }
691 
692 #endif
693 template<SWR_FORMAT DstFormat>
694 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
695 {
696     static const uint32_t offset = sizeof(simdscalar);
697 
698     // swizzle rgba -> bgra while we load
699     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
700     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
701     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
702     simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
703 
704     // clamp
705     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
706     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
707 
708     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
709     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
710 
711     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
712     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
713 
714     vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
715     vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
716 
717     if (FormatTraits<DstFormat>::isSRGB)
718     {
719         // Gamma-correct only rgb
720         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
721         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
722         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
723     }
724 
725     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
726     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
727     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
728     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
729     vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
730 
731     // moving to 8 wide integer vector types
732     simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
733     simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
734     simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
735     simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
736 
737 #if KNOB_ARCH <= KNOB_ARCH_AVX
738 
739     // splitting into two sets of 4 wide integer vector types
740     // because AVX doesn't have instructions to support this operation at 8 wide
741     simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
742     simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
743     simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
744     simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
745 
746     simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
747     simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
748     simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
749     simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
750 
751     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
752     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
753     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
754     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
755     srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
756     srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
757 
758     srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
759     srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
760 
761     srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
762     srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
763 
764     srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
765     srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
766 
767     // unpack into rows that get the tiling order correct
768     simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
769     simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
770 
771     simdscalari final = _mm256_castsi128_si256(vRow00);
772     final = _mm256_insertf128_si256(final, vRow10, 1);
773 
774 #else
775 
776     // logic is as above, only wider
777     src1 = _mm256_slli_si256(src1, 1);
778     src2 = _mm256_slli_si256(src2, 2);
779     src3 = _mm256_slli_si256(src3, 3);
780 
781     src0 = _mm256_or_si256(src0, src1);
782     src2 = _mm256_or_si256(src2, src3);
783 
784     simdscalari final = _mm256_or_si256(src0, src2);
785 
786     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
787     final = _mm256_permute4x64_epi64(final, 0xD8);
788 #endif
789 
790     _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
791 }
792 
793 #if USE_8x2_TILE_BACKEND
794 template<SWR_FORMAT DstFormat>
795 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
796 {
797     // swizzle rgba -> bgra while we load
798     simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
799     simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
800     simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
801 
802     // clamp
803     const simd16scalar zero = _simd16_setzero_ps();
804     const simd16scalar ones = _simd16_set1_ps(1.0f);
805 
806     comp0 = _simd16_max_ps(comp0, zero);
807     comp0 = _simd16_min_ps(comp0, ones);
808 
809     comp1 = _simd16_max_ps(comp1, zero);
810     comp1 = _simd16_min_ps(comp1, ones);
811 
812     comp2 = _simd16_max_ps(comp2, zero);
813     comp2 = _simd16_min_ps(comp2, ones);
814 
815     // gamma-correct only rgb
816     if (FormatTraits<DstFormat>::isSRGB)
817     {
818         comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
819         comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
820         comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
821     }
822 
823     // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
824     comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
825     comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
826     comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
827 
828     // moving to 16 wide integer vector types
829     simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
830     simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
831     simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
832 
833     // SOA to AOS conversion
834     src1 = _simd16_slli_epi32(src1,  8);
835     src2 = _simd16_slli_epi32(src2, 16);
836 
837     simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2);                       // 0 1 2 3 4 5 6 7 8 9 A B C D E F
838 
839     // de-swizzle conversion
840 #if 1
841     simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
842     simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
843 
844     final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
845 
846 #else
847     final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
848 
849 #endif
850     // store 8x2 memory order:
851     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
852     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
853     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
854     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
855 }
856 
857 #endif
858 template<SWR_FORMAT DstFormat>
859 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
860 {
861     static const uint32_t offset = sizeof(simdscalar);
862 
863     // swizzle rgba -> bgra while we load
864     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
865     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
866     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
867                                                                                                             // clamp
868     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
869     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
870 
871     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
872     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
873 
874     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
875     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
876 
877     if (FormatTraits<DstFormat>::isSRGB)
878     {
879         // Gamma-correct only rgb
880         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
881         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
882         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
883     }
884 
885     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
886     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
887     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
888     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
889 
890     // moving to 8 wide integer vector types
891     simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
892     simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
893     simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
894 
895 #if KNOB_ARCH <= KNOB_ARCH_AVX
896 
897     // splitting into two sets of 4 wide integer vector types
898     // because AVX doesn't have instructions to support this operation at 8 wide
899     simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
900     simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
901     simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
902 
903     simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
904     simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
905     simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
906 
907     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
908     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
909     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
910     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
911 
912     srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
913 
914     srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
915 
916     srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
917     srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
918 
919     // unpack into rows that get the tiling order correct
920     simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
921     simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
922 
923     simdscalari final = _mm256_castsi128_si256(vRow00);
924     final = _mm256_insertf128_si256(final, vRow10, 1);
925 
926 #else
927 
928                                               // logic is as above, only wider
929     src1 = _mm256_slli_si256(src1, 1);
930     src2 = _mm256_slli_si256(src2, 2);
931 
932     src0 = _mm256_or_si256(src0, src1);
933 
934     simdscalari final = _mm256_or_si256(src0, src2);
935 
936     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
937     final = _mm256_permute4x64_epi64(final, 0xD8);
938 
939 #endif
940 
941     _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
942 }
943 
944 template<>
945 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
946 {
947     template <size_t NumDests>
948     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
949     {
950 #if USE_8x2_TILE_BACKEND
951         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
952 #else
953         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
954 #endif
955     }
956 };
957 
958 template<>
959 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
960 {
961     template <size_t NumDests>
962     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
963     {
964 #if USE_8x2_TILE_BACKEND
965         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
966 #else
967         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
968 #endif
969     }
970 };
971 
972 template<>
973 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
974 {
975     template <size_t NumDests>
976     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
977     {
978 #if USE_8x2_TILE_BACKEND
979         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
980 #else
981         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
982 #endif
983     }
984 };
985 
986 template<>
987 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
988 {
989     template <size_t NumDests>
990     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
991     {
992 #if USE_8x2_TILE_BACKEND
993         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
994 #else
995         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
996 #endif
997     }
998 };
999 
1000 template<>
1001 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
1002 {
1003     template <size_t NumDests>
1004     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1005     {
1006 #if USE_8x2_TILE_BACKEND
1007         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1008 #else
1009         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1010 #endif
1011     }
1012 };
1013 
1014 template<>
1015 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
1016 {
1017     template <size_t NumDests>
1018     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1019     {
1020 #if USE_8x2_TILE_BACKEND
1021         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1022 #else
1023         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1024 #endif
1025     }
1026 };
1027 
1028 template<>
1029 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
1030 {
1031     template <size_t NumDests>
1032     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1033     {
1034 #if USE_8x2_TILE_BACKEND
1035         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1036 #else
1037         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1038 #endif
1039     }
1040 };
1041 
1042 template<>
1043 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
1044 {
1045     template <size_t NumDests>
1046     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1047     {
1048 #if USE_8x2_TILE_BACKEND
1049         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1050 #else
1051         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1052 #endif
1053     }
1054 };
1055 
1056 //////////////////////////////////////////////////////////////////////////
1057 /// StoreRasterTile
1058 //////////////////////////////////////////////////////////////////////////
1059 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1060 struct StoreRasterTile
1061 {
1062     //////////////////////////////////////////////////////////////////////////
1063     /// @brief Retrieve color from hot tile source which is always float.
1064     /// @param pSrc - Pointer to raster tile.
1065     /// @param x, y - Coordinates to raster tile.
1066     /// @param output - output color
1067     INLINE static void GetSwizzledSrcColor(
1068         uint8_t* pSrc,
1069         uint32_t x, uint32_t y,
1070         float outputColor[4])
1071     {
1072 #if USE_8x2_TILE_BACKEND
1073         typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1074 
1075         SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1076 
1077         // Compute which simd tile we're accessing within 8x8 tile.
1078         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1079         uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1080 
1081         SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1082 
1083         uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1084 
1085         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1086 #else
1087         typedef SimdTile<SrcFormat, DstFormat> SimdT;
1088 
1089         SimdT* pSrcSimdTiles = (SimdT*)pSrc;
1090 
1091         // Compute which simd tile we're accessing within 8x8 tile.
1092         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1093         uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
1094 
1095         SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
1096 
1097         uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
1098 
1099         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1100 #endif
1101     }
1102 
1103     //////////////////////////////////////////////////////////////////////////
1104     /// @brief Stores an 8x8 raster tile to the destination surface.
1105     /// @param pSrc - Pointer to raster tile.
1106     /// @param pDstSurface - Destination surface state
1107     /// @param x, y - Coordinates to raster tile.
1108     INLINE static void Store(
1109         uint8_t *pSrc,
1110         SWR_SURFACE_STATE* pDstSurface,
1111         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1112     {
1113         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1114         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1115 
1116         // For each raster tile pixel (rx, ry)
1117         for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1118         {
1119             for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1120             {
1121                 // Perform bounds checking.
1122                 if (((x + rx) < lodWidth) &&
1123                     ((y + ry) < lodHeight))
1124                 {
1125                     float srcColor[4];
1126                     GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
1127 
1128                     uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1129                         pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
1130                         sampleNum, pDstSurface->lod, pDstSurface);
1131                     {
1132                         ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
1133                     }
1134                 }
1135             }
1136         }
1137     }
1138 
1139     //////////////////////////////////////////////////////////////////////////
1140     /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
1141     /// @param pSrc - Pointer to raster tile.
1142     /// @param pDstSurface - Destination surface state
1143     /// @param x, y - Coordinates to raster tile.
1144     /// @param sampleOffset - Offset between adjacent multisamples
1145     INLINE static void Resolve(
1146         uint8_t *pSrc,
1147         SWR_SURFACE_STATE* pDstSurface,
1148         uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1149     {
1150         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1151         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1152 
1153         float oneOverNumSamples = 1.0f / pDstSurface->numSamples;
1154 
1155         // For each raster tile pixel (rx, ry)
1156         for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1157         {
1158             for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1159             {
1160                 // Perform bounds checking.
1161                 if (((x + rx) < lodWidth) &&
1162                         ((y + ry) < lodHeight))
1163                 {
1164                     // Sum across samples
1165                     float resolveColor[4] = {0};
1166                     for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1167                     {
1168                         float sampleColor[4] = {0};
1169                         uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
1170                         GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
1171                         resolveColor[0] += sampleColor[0];
1172                         resolveColor[1] += sampleColor[1];
1173                         resolveColor[2] += sampleColor[2];
1174                         resolveColor[3] += sampleColor[3];
1175                     }
1176 
1177                     // Divide by numSamples to average
1178                     resolveColor[0] *= oneOverNumSamples;
1179                     resolveColor[1] *= oneOverNumSamples;
1180                     resolveColor[2] *= oneOverNumSamples;
1181                     resolveColor[3] *= oneOverNumSamples;
1182 
1183                     // Use the resolve surface state
1184                     SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
1185                     uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1186                         pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
1187                         0, pResolveSurface->lod, pResolveSurface);
1188                     {
1189                         ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
1190                     }
1191                 }
1192             }
1193         }
1194     }
1195 
1196 };
1197 
1198 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1199 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1200 {};
1201 
1202 //////////////////////////////////////////////////////////////////////////
1203 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1204 //////////////////////////////////////////////////////////////////////////
1205 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1206 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1207 {
1208     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1209     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1210     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1211 
1212     //////////////////////////////////////////////////////////////////////////
1213     /// @brief Stores an 8x8 raster tile to the destination surface.
1214     /// @param pSrc - Pointer to raster tile.
1215     /// @param pDstSurface - Destination surface state
1216     /// @param x, y - Coordinates to raster tile.
1217     INLINE static void Store(
1218         uint8_t *pSrc,
1219         SWR_SURFACE_STATE* pDstSurface,
1220         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1221     {
1222         // Punt non-full tiles to generic store
1223         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1224         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1225 
1226         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1227         {
1228             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1229         }
1230 
1231         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1232             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1233 #if USE_8x2_TILE_BACKEND
1234 
1235         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1236         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1237 
1238         uint8_t* ppDsts[] =
1239         {
1240             pDst,                                           // row 0, col 0
1241             pDst + pDstSurface->pitch,                      // row 1, col 0
1242             pDst + dx / 2,                                  // row 0, col 1
1243             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1244         };
1245 
1246         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1247         {
1248             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1249             {
1250                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1251 
1252                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1253 
1254                 ppDsts[0] += dx;
1255                 ppDsts[1] += dx;
1256                 ppDsts[2] += dx;
1257                 ppDsts[3] += dx;
1258             }
1259 
1260             ppDsts[0] += dy;
1261             ppDsts[1] += dy;
1262             ppDsts[2] += dy;
1263             ppDsts[3] += dy;
1264         }
1265 #else
1266         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1267 
1268         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1269         {
1270             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1271 
1272             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1273             {
1274                 // Format conversion and convert from SOA to AOS, and store the rows.
1275                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1276 
1277                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1278                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1279                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1280             }
1281 
1282             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1283             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1284         }
1285 #endif
1286     }
1287 };
1288 
1289 //////////////////////////////////////////////////////////////////////////
1290 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1291 //////////////////////////////////////////////////////////////////////////
1292 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1293 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1294 {
1295     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1296     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1297     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1298 
1299     //////////////////////////////////////////////////////////////////////////
1300     /// @brief Stores an 8x8 raster tile to the destination surface.
1301     /// @param pSrc - Pointer to raster tile.
1302     /// @param pDstSurface - Destination surface state
1303     /// @param x, y - Coordinates to raster tile.
1304     INLINE static void Store(
1305         uint8_t *pSrc,
1306         SWR_SURFACE_STATE* pDstSurface,
1307         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1308     {
1309         // Punt non-full tiles to generic store
1310         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1311         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1312 
1313         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1314         {
1315             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1316         }
1317 
1318         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1319             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1320 #if USE_8x2_TILE_BACKEND
1321 
1322         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1323         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1324 
1325         uint8_t* ppDsts[] =
1326         {
1327             pDst,                                           // row 0, col 0
1328             pDst + pDstSurface->pitch,                      // row 1, col 0
1329             pDst + dx / 2,                                  // row 0, col 1
1330             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1331         };
1332 
1333         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1334         {
1335             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1336             {
1337                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1338 
1339                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1340 
1341                 ppDsts[0] += dx;
1342                 ppDsts[1] += dx;
1343                 ppDsts[2] += dx;
1344                 ppDsts[3] += dx;
1345             }
1346 
1347             ppDsts[0] += dy;
1348             ppDsts[1] += dy;
1349             ppDsts[2] += dy;
1350             ppDsts[3] += dy;
1351         }
1352 #else
1353         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1354 
1355         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1356         {
1357             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1358 
1359             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1360             {
1361                 // Format conversion and convert from SOA to AOS, and store the rows.
1362                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1363 
1364                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1365                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1366                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1367             }
1368 
1369             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1370             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1371         }
1372 #endif
1373     }
1374 };
1375 
1376 //////////////////////////////////////////////////////////////////////////
1377 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1378 //////////////////////////////////////////////////////////////////////////
1379 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1380 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1381 {
1382     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1383     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1384     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1385 
1386     //////////////////////////////////////////////////////////////////////////
1387     /// @brief Stores an 8x8 raster tile to the destination surface.
1388     /// @param pSrc - Pointer to raster tile.
1389     /// @param pDstSurface - Destination surface state
1390     /// @param x, y - Coordinates to raster tile.
1391     INLINE static void Store(
1392         uint8_t *pSrc,
1393         SWR_SURFACE_STATE* pDstSurface,
1394         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1395     {
1396         // Punt non-full tiles to generic store
1397         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1398         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1399 
1400         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1401         {
1402             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1403         }
1404 
1405         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1406             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1407 #if USE_8x2_TILE_BACKEND
1408 
1409         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1410         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1411 
1412         uint8_t* ppDsts[] =
1413         {
1414             pDst,                                           // row 0, col 0
1415             pDst + pDstSurface->pitch,                      // row 1, col 0
1416             pDst + dx / 2,                                  // row 0, col 1
1417             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1418         };
1419 
1420         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1421         {
1422             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1423             {
1424                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1425 
1426                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1427 
1428                 ppDsts[0] += dx;
1429                 ppDsts[1] += dx;
1430                 ppDsts[2] += dx;
1431                 ppDsts[3] += dx;
1432             }
1433 
1434             ppDsts[0] += dy;
1435             ppDsts[1] += dy;
1436             ppDsts[2] += dy;
1437             ppDsts[3] += dy;
1438         }
1439 #else
1440         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1441 
1442         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1443         {
1444             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1445 
1446             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1447             {
1448                 // Format conversion and convert from SOA to AOS, and store the rows.
1449                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1450 
1451                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1452                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1453                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1454             }
1455 
1456             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1457             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1458         }
1459 #endif
1460     }
1461 };
1462 
1463 //////////////////////////////////////////////////////////////////////////
1464 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1465 //////////////////////////////////////////////////////////////////////////
1466 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1467 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1468 {
1469     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1470     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1471     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1472     static const size_t MAX_DST_COLUMN_BYTES = 16;
1473 #if !USE_8x2_TILE_BACKEND
1474     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1475     static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1476 #endif
1477 
1478     //////////////////////////////////////////////////////////////////////////
1479     /// @brief Stores an 8x8 raster tile to the destination surface.
1480     /// @param pSrc - Pointer to raster tile.
1481     /// @param pDstSurface - Destination surface state
1482     /// @param x, y - Coordinates to raster tile.
1483     INLINE static void Store(
1484         uint8_t *pSrc,
1485         SWR_SURFACE_STATE* pDstSurface,
1486         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1487     {
1488         // Punt non-full tiles to generic store
1489         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1490         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1491 
1492         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1493         {
1494             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1495         }
1496 
1497         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1498             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1499 #if USE_8x2_TILE_BACKEND
1500 
1501         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1502         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1503 
1504         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1505         static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1506 
1507         uint8_t *ppDsts[] =
1508         {
1509             pDst,                                                               // row 0, col 0
1510             pDst + pDstSurface->pitch,                                          // row 1, col 0
1511             pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
1512             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
1513             pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
1514             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
1515             pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
1516             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3                // row 1, col 3
1517         };
1518 
1519         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1520         {
1521             // Raster tile width is same as simd16 tile width
1522             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1523 
1524             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1525 
1526             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1527 
1528             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1529             {
1530                 ppDsts[i] += dy;
1531             }
1532         }
1533 #else
1534         uint8_t* ppDsts[] =
1535         {
1536             pDst,                                               // row 0, col 0
1537             pDst + pDstSurface->pitch,                          // row 1, col 0
1538             pDst + MAX_DST_COLUMN_BYTES,                        // row 0, col 1
1539             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,   // row 1, col 1
1540         };
1541 
1542         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1543         {
1544             uint8_t* ppStartRows[] =
1545             {
1546                 ppDsts[0],
1547                 ppDsts[1],
1548                 ppDsts[2],
1549                 ppDsts[3],
1550             };
1551 
1552             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1553             {
1554                 // Format conversion and convert from SOA to AOS, and store the rows.
1555                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1556 
1557                 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1558                 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1559                 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1560                 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1561                 pSrc += SRC_COLUMN_BYTES;
1562             }
1563 
1564             ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1565             ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1566             ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
1567             ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
1568         }
1569 #endif
1570     }
1571 };
1572 
1573 //////////////////////////////////////////////////////////////////////////
1574 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1575 //////////////////////////////////////////////////////////////////////////
1576 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1577 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1578 {
1579     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1580     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1581     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1582     static const size_t MAX_DST_COLUMN_BYTES = 16;
1583 #if !USE_8x2_TILE_BACKEND
1584     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1585     static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1586 #endif
1587 
1588     //////////////////////////////////////////////////////////////////////////
1589     /// @brief Stores an 8x8 raster tile to the destination surface.
1590     /// @param pSrc - Pointer to raster tile.
1591     /// @param pDstSurface - Destination surface state
1592     /// @param x, y - Coordinates to raster tile.
1593     INLINE static void Store(
1594         uint8_t *pSrc,
1595         SWR_SURFACE_STATE* pDstSurface,
1596         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1597     {
1598         // Punt non-full tiles to generic store
1599         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1600         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1601 
1602         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1603         {
1604             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1605         }
1606 
1607         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1608             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1609 #if USE_8x2_TILE_BACKEND
1610 
1611         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1612         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1613 
1614         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1615         static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1616 
1617         uint8_t* ppDsts[] =
1618         {
1619             pDst,                                                               // row 0, col 0
1620             pDst + pDstSurface->pitch,                                          // row 1, col 0
1621             pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
1622             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
1623             pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
1624             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
1625             pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
1626             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,               // row 1, col 3
1627             pDst + MAX_DST_COLUMN_BYTES * 4,                                    // row 0, col 4
1628             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4,               // row 1, col 4
1629             pDst + MAX_DST_COLUMN_BYTES * 5,                                    // row 0, col 5
1630             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5,               // row 1, col 5
1631             pDst + MAX_DST_COLUMN_BYTES * 6,                                    // row 0, col 6
1632             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6,               // row 1, col 6
1633             pDst + MAX_DST_COLUMN_BYTES * 7,                                    // row 0, col 7
1634             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7,               // row 1, col 7
1635         };
1636 
1637         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1638         {
1639             // Raster tile width is same as simd16 tile width
1640             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1641 
1642             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1643 
1644             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1645 
1646             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1647             {
1648                 ppDsts[i] += dy;
1649             }
1650         }
1651 #else
1652         struct DstPtrs
1653         {
1654             uint8_t* ppDsts[8];
1655         } ptrs;
1656 
1657         // Need 8 pointers, 4 columns of 2 rows each
1658         for (uint32_t y = 0; y < 2; ++y)
1659         {
1660             for (uint32_t x = 0; x < 4; ++x)
1661             {
1662                 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1663             }
1664         }
1665 
1666         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1667         {
1668             DstPtrs startPtrs = ptrs;
1669 
1670             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1671             {
1672                 // Format conversion and convert from SOA to AOS, and store the rows.
1673                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1674 
1675                 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1676                 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1677                 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1678                 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1679                 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1680                 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1681                 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1682                 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1683                 pSrc += SRC_COLUMN_BYTES;
1684             }
1685 
1686             ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1687             ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1688             ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1689             ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1690             ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1691             ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1692             ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1693             ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1694         }
1695 #endif
1696     }
1697 };
1698 
1699 //////////////////////////////////////////////////////////////////////////
1700 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1701 //////////////////////////////////////////////////////////////////////////
1702 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1703 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1704 {
1705     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1706     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1707 
1708     //////////////////////////////////////////////////////////////////////////
1709     /// @brief Stores an 8x8 raster tile to the destination surface.
1710     /// @param pSrc - Pointer to raster tile.
1711     /// @param pDstSurface - Destination surface state
1712     /// @param x, y - Coordinates to raster tile.
1713     INLINE static void Store(
1714         uint8_t *pSrc,
1715         SWR_SURFACE_STATE* pDstSurface,
1716         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1717     {
1718         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1719 
1720         // Punt non-full tiles to generic store
1721         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1722         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1723 
1724         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1725         {
1726             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1727         }
1728 
1729         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1730         // We can compute the offsets to each column within the raster tile once and increment from these.
1731 #if USE_8x2_TILE_BACKEND
1732         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1733         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1734             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1735 
1736         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1737 
1738         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1739         uint8_t *ppDsts[] =
1740         {
1741             pDst,
1742             pDst + DestRowWidthBytes,
1743             pDst + DestRowWidthBytes / 4,
1744             pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1745         };
1746 
1747         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1748         {
1749             // Raster tile width is same as simd16 tile width
1750             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1751 
1752             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1753 
1754             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1755 
1756             ppDsts[0] += dy;
1757             ppDsts[1] += dy;
1758             ppDsts[2] += dy;
1759             ppDsts[3] += dy;
1760         }
1761 #else
1762         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1763         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1764             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1765 
1766         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1767         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1768 
1769         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1770         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1771         {
1772             uint32_t rowOffset = row * DestRowWidthBytes;
1773 
1774             uint8_t* pRow = pCol0 + rowOffset;
1775             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1776 
1777             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1778             pSrc += pSrcInc;
1779 
1780             ppDsts[0] += DestRowWidthBytes / 4;
1781             ppDsts[1] += DestRowWidthBytes / 4;
1782 
1783             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1784             pSrc += pSrcInc;
1785         }
1786 #endif
1787     }
1788 };
1789 
1790 //////////////////////////////////////////////////////////////////////////
1791 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1792 //////////////////////////////////////////////////////////////////////////
1793 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1794 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1795 {
1796     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1797     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1798 
1799     //////////////////////////////////////////////////////////////////////////
1800     /// @brief Stores an 8x8 raster tile to the destination surface.
1801     /// @param pSrc - Pointer to raster tile.
1802     /// @param pDstSurface - Destination surface state
1803     /// @param x, y - Coordinates to raster tile.
1804     INLINE static void Store(
1805         uint8_t *pSrc,
1806         SWR_SURFACE_STATE* pDstSurface,
1807         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1808     {
1809         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1810 
1811         // Punt non-full tiles to generic store
1812         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1813         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1814 
1815         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1816         {
1817             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1818         }
1819 
1820         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1821         // We can compute the offsets to each column within the raster tile once and increment from these.
1822 #if USE_8x2_TILE_BACKEND
1823         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1824         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1825             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1826 
1827         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1828 
1829         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1830         uint8_t *ppDsts[] =
1831         {
1832             pDst,
1833             pDst + DestRowWidthBytes,
1834             pDst + DestRowWidthBytes / 2,
1835             pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1836         };
1837 
1838         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1839         {
1840             // Raster tile width is same as simd16 tile width
1841             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1842 
1843             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1844 
1845             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1846 
1847             ppDsts[0] += dy;
1848             ppDsts[1] += dy;
1849             ppDsts[2] += dy;
1850             ppDsts[3] += dy;
1851         }
1852 #else
1853         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1854         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1855             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1856 
1857         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1858         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1859 
1860         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1861         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1862         {
1863             uint32_t rowOffset = row * DestRowWidthBytes;
1864 
1865             uint8_t* pRow = pCol0 + rowOffset;
1866             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1867 
1868             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1869             pSrc += pSrcInc;
1870 
1871             ppDsts[0] += DestRowWidthBytes / 2;
1872             ppDsts[1] += DestRowWidthBytes / 2;
1873 
1874             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1875             pSrc += pSrcInc;
1876         }
1877 #endif
1878     }
1879 };
1880 
1881 //////////////////////////////////////////////////////////////////////////
1882 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1883 //////////////////////////////////////////////////////////////////////////
1884 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1885 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1886 {
1887     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1888     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1889     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1890 
1891     //////////////////////////////////////////////////////////////////////////
1892     /// @brief Stores an 8x8 raster tile to the destination surface.
1893     /// @param pSrc - Pointer to raster tile.
1894     /// @param pDstSurface - Destination surface state
1895     /// @param x, y - Coordinates to raster tile.
1896     INLINE static void Store(
1897         uint8_t *pSrc,
1898         SWR_SURFACE_STATE* pDstSurface,
1899         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1900     {
1901         static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
1902 
1903         // Punt non-full tiles to generic store
1904         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1905         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1906 
1907         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1908         {
1909             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1910         }
1911 
1912         // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1913         // We can compute the offsets to each column within the raster tile once and increment from these.
1914 #if USE_8x2_TILE_BACKEND
1915         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1916             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1917 
1918         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1919         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1920 
1921         uint8_t* ppDsts[] =
1922         {
1923             pDst,                                           // row 0, col 0
1924             pDst + DestRowWidthBytes,                       // row 1, col 0
1925             pDst + dx / 2,                                  // row 0, col 1
1926             pDst + DestRowWidthBytes + dx / 2               // row 1, col 1
1927         };
1928 
1929         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1930         {
1931             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1932             {
1933                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1934 
1935                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1936 
1937                 ppDsts[0] += dx;
1938                 ppDsts[1] += dx;
1939                 ppDsts[2] += dx;
1940                 ppDsts[3] += dx;
1941             }
1942 
1943             ppDsts[0] += dy;
1944             ppDsts[1] += dy;
1945             ppDsts[2] += dy;
1946             ppDsts[3] += dy;
1947         }
1948 #else
1949         uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1950             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1951         uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1952 
1953         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1954         {
1955             for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1956             {
1957                 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1958 
1959                 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1960                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1961 
1962                 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1963                 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1964             }
1965 
1966             pRow0 += (DestRowWidthBytes * 2);
1967             pRow1 += (DestRowWidthBytes * 2);
1968         }
1969 #endif
1970     }
1971 };
1972 
1973 //////////////////////////////////////////////////////////////////////////
1974 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1975 //////////////////////////////////////////////////////////////////////////
1976 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1977 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1978 {
1979     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1980     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1981 
1982     //////////////////////////////////////////////////////////////////////////
1983     /// @brief Stores an 8x8 raster tile to the destination surface.
1984     /// @param pSrc - Pointer to raster tile.
1985     /// @param pDstSurface - Destination surface state
1986     /// @param x, y - Coordinates to raster tile.
1987     INLINE static void Store(
1988         uint8_t *pSrc,
1989         SWR_SURFACE_STATE* pDstSurface,
1990         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1991     {
1992         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1993         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
1994 
1995         // Punt non-full tiles to generic store
1996         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1997         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1998 
1999         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2000         {
2001             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2002         }
2003 
2004         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2005         // We can compute the offsets to each column within the raster tile once and increment from these.
2006 #if USE_8x2_TILE_BACKEND
2007         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2008         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2009             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2010 
2011         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2012         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2013 
2014         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2015         uint8_t *ppDsts[] =
2016         {
2017             pDst,                                           // row 0, col 0
2018             pDst + DestRowWidthBytes,                       // row 1, col 0
2019             pDst + DestColumnBytes,                         // row 0, col 1
2020             pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
2021         };
2022 
2023         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2024         {
2025             // Raster tile width is same as simd16 tile width
2026             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2027 
2028             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2029 
2030             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2031 
2032             ppDsts[0] += dy;
2033             ppDsts[1] += dy;
2034             ppDsts[2] += dy;
2035             ppDsts[3] += dy;
2036         }
2037 #else
2038         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2039         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2040             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2041 
2042         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2043         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2044 
2045         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2046         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2047         {
2048             uint32_t rowOffset = row * DestRowWidthBytes;
2049 
2050             uint8_t* pRow = pCol0 + rowOffset;
2051             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
2052 
2053             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2054             pSrc += pSrcInc;
2055 
2056             ppDsts[0] += DestColumnBytes;
2057             ppDsts[1] += DestColumnBytes;
2058 
2059             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2060             pSrc += pSrcInc;
2061         }
2062 #endif
2063     }
2064 };
2065 
2066 //////////////////////////////////////////////////////////////////////////
2067 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2068 //////////////////////////////////////////////////////////////////////////
2069 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2070 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
2071 {
2072     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2073     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2074 
2075     //////////////////////////////////////////////////////////////////////////
2076     /// @brief Stores an 8x8 raster tile to the destination surface.
2077     /// @param pSrc - Pointer to raster tile.
2078     /// @param pDstSurface - Destination surface state
2079     /// @param x, y - Coordinates to raster tile.
2080     INLINE static void Store(
2081         uint8_t *pSrc,
2082         SWR_SURFACE_STATE* pDstSurface,
2083         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2084     {
2085         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
2086         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
2087 
2088         // Punt non-full tiles to generic store
2089         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2090         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2091 
2092         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2093         {
2094             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2095         }
2096 
2097         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2098         // We can compute the offsets to each column within the raster tile once and increment from these.
2099 #if USE_8x2_TILE_BACKEND
2100         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2101         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2102             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2103 
2104         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2105         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2106 
2107         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2108         uint8_t *ppDsts[] =
2109         {
2110             pDst,                                           // row 0, col 0
2111             pDst + DestRowWidthBytes,                       // row 1, col 0
2112             pDst + DestColumnBytes,                         // row 0, col 1
2113             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
2114             pDst + DestColumnBytes * 2,                     // row 0, col 2
2115             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2116             pDst + DestColumnBytes * 3,                     // row 0, col 3
2117             pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
2118         };
2119 
2120         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2121         {
2122             // Raster tile width is same as simd16 tile width
2123             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2124 
2125             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2126 
2127             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2128 
2129             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
2130             {
2131                 ppDsts[i] += dy;
2132             }
2133         }
2134 #else
2135         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2136         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2137             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2138         uint8_t* pCol1 = pCol0 + DestColumnBytes;
2139 
2140         // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2141         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2142         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2143 
2144         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2145         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2146         {
2147             uint32_t rowOffset = row * DestRowWidthBytes;
2148             uint8_t* ppDsts[] =
2149             {
2150                 pCol0 + rowOffset,
2151                 pCol0 + rowOffset + DestRowWidthBytes,
2152                 pCol1 + rowOffset,
2153                 pCol1 + rowOffset + DestRowWidthBytes,
2154             };
2155 
2156             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2157             pSrc += pSrcInc;
2158 
2159             ppDsts[0] += DestColumnBytes * 2;
2160             ppDsts[1] += DestColumnBytes * 2;
2161             ppDsts[2] += DestColumnBytes * 2;
2162             ppDsts[3] += DestColumnBytes * 2;
2163 
2164             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2165             pSrc += pSrcInc;
2166         }
2167 #endif
2168     }
2169 };
2170 
2171 //////////////////////////////////////////////////////////////////////////
2172 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2173 //////////////////////////////////////////////////////////////////////////
2174 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2175 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
2176 {
2177     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2178 #if USE_8x2_TILE_BACKEND
2179     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2180 
2181 #else
2182     static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2183     static const size_t TILE_Y_ROWS = 32;
2184     static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2185 
2186     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2187     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2188     static const size_t MAX_DST_COLUMN_BYTES = 16;
2189 
2190     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
2191     static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
2192 
2193 #endif
2194     //////////////////////////////////////////////////////////////////////////
2195     /// @brief Stores an 8x8 raster tile to the destination surface.
2196     /// @param pSrc - Pointer to raster tile.
2197     /// @param pDstSurface - Destination surface state
2198     /// @param x, y - Coordinates to raster tile.
2199     INLINE static void Store(
2200         uint8_t *pSrc,
2201         SWR_SURFACE_STATE* pDstSurface,
2202         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2203     {
2204 #if USE_8x2_TILE_BACKEND
2205         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
2206         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
2207 #endif
2208 
2209         // Punt non-full tiles to generic store
2210         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2211         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2212 
2213         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2214         {
2215             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2216         }
2217 
2218         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2219         // We can compute the offsets to each column within the raster tile once and increment from these.
2220 #if USE_8x2_TILE_BACKEND
2221         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2222         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2223             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2224 
2225         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2226         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2227 
2228         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2229         uint8_t *ppDsts[] =
2230         {
2231             pDst,                                           // row 0, col 0
2232             pDst + DestRowWidthBytes,                       // row 1, col 0
2233             pDst + DestColumnBytes,                         // row 0, col 1
2234             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
2235             pDst + DestColumnBytes * 2,                     // row 0, col 2
2236             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2237             pDst + DestColumnBytes * 3,                     // row 0, col 3
2238             pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
2239             pDst + DestColumnBytes * 4,                     // row 0, col 4
2240             pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
2241             pDst + DestColumnBytes * 5,                     // row 0, col 5
2242             pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
2243             pDst + DestColumnBytes * 6,                     // row 0, col 6
2244             pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
2245             pDst + DestColumnBytes * 7,                     // row 0, col 7
2246             pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
2247         };
2248 
2249         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2250         {
2251             // Raster tile width is same as simd16 tile width
2252             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2253 
2254             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2255 
2256             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2257 
2258             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
2259             {
2260                 ppDsts[i] += dy;
2261             }
2262         }
2263 #else
2264         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2265         uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2266             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2267         struct DstPtrs
2268         {
2269             uint8_t* ppDsts[8];
2270         } ptrs;
2271 
2272         // Need 8 pointers, 4 columns of 2 rows each
2273         for (uint32_t y = 0; y < 2; ++y)
2274         {
2275             for (uint32_t x = 0; x < 4; ++x)
2276             {
2277                 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
2278             }
2279         }
2280 
2281         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
2282         {
2283             DstPtrs startPtrs = ptrs;
2284 
2285             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
2286             {
2287                 // Format conversion and convert from SOA to AOS, and store the rows.
2288                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
2289 
2290                 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
2291                 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
2292                 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
2293                 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
2294                 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
2295                 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
2296                 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
2297                 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
2298                 pSrc += SRC_COLUMN_BYTES;
2299             }
2300 
2301             ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
2302             ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
2303             ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
2304             ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
2305             ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
2306             ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
2307             ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
2308             ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
2309         }
2310 #endif
2311     }
2312 };
2313 
2314 //////////////////////////////////////////////////////////////////////////
2315 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2316 //////////////////////////////////////////////////////////////////////////
2317 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2318 struct StoreMacroTile
2319 {
2320     //////////////////////////////////////////////////////////////////////////
2321     /// @brief Stores a macrotile to the destination surface using safe implementation.
2322     /// @param pSrc - Pointer to macro tile.
2323     /// @param pDstSurface - Destination surface state
2324     /// @param x, y - Coordinates to macro tile
2325     static void StoreGeneric(
2326         uint8_t *pSrcHotTile,
2327         SWR_SURFACE_STATE* pDstSurface,
2328         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2329     {
2330         PFN_STORE_TILES_INTERNAL pfnStore;
2331         pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2332 
2333         // Store each raster tile from the hot tile to the destination surface.
2334         for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2335         {
2336             for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2337             {
2338                 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2339                 {
2340                     pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2341                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2342                 }
2343             }
2344         }
2345 
2346     }
2347 
2348     typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
2349     //////////////////////////////////////////////////////////////////////////
2350     /// @brief Stores a macrotile to the destination surface.
2351     /// @param pSrc - Pointer to macro tile.
2352     /// @param pDstSurface - Destination surface state
2353     /// @param x, y - Coordinates to macro tile
2354     static void Store(
2355         uint8_t *pSrcHotTile,
2356         SWR_SURFACE_STATE* pDstSurface,
2357         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2358     {
2359         PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
2360 
2361         for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2362         {
2363             size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
2364                 0,
2365                 0,
2366                 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
2367                 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
2368                 sampleNum,
2369                 pDstSurface->lod,
2370                 pDstSurface);
2371 
2372             // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2373             bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
2374                 (pDstSurface->bInterleavedSamples);
2375 
2376             pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2377         }
2378 
2379         // Save original for pSrcHotTile resolve.
2380         uint8_t *pResolveSrcHotTile = pSrcHotTile;
2381 
2382         // Store each raster tile from the hot tile to the destination surface.
2383         for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2384         {
2385             for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2386             {
2387                 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2388                 {
2389                     pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2390                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2391                 }
2392             }
2393         }
2394 
2395         if (pDstSurface->xpAuxBaseAddress)
2396         {
2397             uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2398             // Store each raster tile from the hot tile to the destination surface.
2399             for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2400             {
2401                 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2402                 {
2403                     StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
2404                     pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples;
2405                 }
2406             }
2407         }
2408     }
2409 };
2410 
2411 //////////////////////////////////////////////////////////////////////////
2412 /// InitStoreTilesTable - Helper for setting up the tables.
2413 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2414 void InitStoreTilesTableColor_Half1(
2415     PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
2416 {
2417     table[TTileMode][R32G32B32A32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
2418     table[TTileMode][R32G32B32A32_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
2419     table[TTileMode][R32G32B32A32_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
2420     table[TTileMode][R32G32B32X32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
2421     table[TTileMode][R32G32B32A32_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
2422     table[TTileMode][R32G32B32A32_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
2423     table[TTileMode][R32G32B32_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
2424     table[TTileMode][R32G32B32_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
2425     table[TTileMode][R32G32B32_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
2426     table[TTileMode][R32G32B32_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
2427     table[TTileMode][R32G32B32_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
2428     table[TTileMode][R16G16B16A16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
2429     table[TTileMode][R16G16B16A16_SNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
2430     table[TTileMode][R16G16B16A16_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
2431     table[TTileMode][R16G16B16A16_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
2432     table[TTileMode][R16G16B16A16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
2433     table[TTileMode][R32G32_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
2434     table[TTileMode][R32G32_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
2435     table[TTileMode][R32G32_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
2436     table[TTileMode][R32_FLOAT_X8X24_TYPELESS]      = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2437     table[TTileMode][X32_TYPELESS_G8X24_UINT]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
2438     table[TTileMode][R16G16B16X16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
2439     table[TTileMode][R16G16B16X16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
2440     table[TTileMode][R16G16B16A16_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
2441     table[TTileMode][R16G16B16A16_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
2442     table[TTileMode][R32G32_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
2443     table[TTileMode][R32G32_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
2444     table[TTileMode][B8G8R8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
2445     table[TTileMode][B8G8R8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
2446     table[TTileMode][R10G10B10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
2447     table[TTileMode][R10G10B10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
2448     table[TTileMode][R10G10B10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
2449     table[TTileMode][R8G8B8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
2450     table[TTileMode][R8G8B8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
2451     table[TTileMode][R8G8B8A8_SNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
2452     table[TTileMode][R8G8B8A8_SINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
2453     table[TTileMode][R8G8B8A8_UINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
2454     table[TTileMode][R16G16_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
2455     table[TTileMode][R16G16_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
2456     table[TTileMode][R16G16_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
2457     table[TTileMode][R16G16_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
2458     table[TTileMode][R16G16_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
2459     table[TTileMode][B10G10R10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
2460     table[TTileMode][B10G10R10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
2461     table[TTileMode][R11G11B10_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
2462     table[TTileMode][R10G10B10_FLOAT_A2_UNORM]      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
2463     table[TTileMode][R32_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
2464     table[TTileMode][R32_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
2465     table[TTileMode][R32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
2466     table[TTileMode][R24_UNORM_X8_TYPELESS]         = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
2467     table[TTileMode][X24_TYPELESS_G8_UINT]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
2468     table[TTileMode][A32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
2469     table[TTileMode][B8G8R8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
2470     table[TTileMode][B8G8R8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
2471     table[TTileMode][R8G8B8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
2472     table[TTileMode][R8G8B8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
2473 }
2474 
2475 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2476 void InitStoreTilesTableColor_Half2(
2477     PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
2478 {
2479     table[TTileMode][R9G9B9E5_SHAREDEXP]            = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
2480     table[TTileMode][B10G10R10X2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
2481     table[TTileMode][R10G10B10X2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
2482     table[TTileMode][R8G8B8A8_SSCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
2483     table[TTileMode][R8G8B8A8_USCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
2484     table[TTileMode][R16G16_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
2485     table[TTileMode][R16G16_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
2486     table[TTileMode][R32_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
2487     table[TTileMode][R32_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
2488     table[TTileMode][B5G6R5_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
2489     table[TTileMode][B5G6R5_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
2490     table[TTileMode][B5G5R5A1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
2491     table[TTileMode][B5G5R5A1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
2492     table[TTileMode][B4G4R4A4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
2493     table[TTileMode][B4G4R4A4_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
2494     table[TTileMode][R8G8_UNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
2495     table[TTileMode][R8G8_SNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
2496     table[TTileMode][R8G8_SINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
2497     table[TTileMode][R8G8_UINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
2498     table[TTileMode][R16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
2499     table[TTileMode][R16_SNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
2500     table[TTileMode][R16_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
2501     table[TTileMode][R16_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
2502     table[TTileMode][R16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
2503     table[TTileMode][A16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
2504     table[TTileMode][A16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
2505     table[TTileMode][B5G5R5X1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
2506     table[TTileMode][B5G5R5X1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
2507     table[TTileMode][R8G8_SSCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
2508     table[TTileMode][R8G8_USCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
2509     table[TTileMode][R16_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
2510     table[TTileMode][R16_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
2511     table[TTileMode][A1B5G5R5_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
2512     table[TTileMode][A4B4G4R4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
2513     table[TTileMode][R8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
2514     table[TTileMode][R8_SNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
2515     table[TTileMode][R8_SINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
2516     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
2517     table[TTileMode][A8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
2518     table[TTileMode][R8_SSCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
2519     table[TTileMode][R8_USCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
2520     table[TTileMode][R8G8B8_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
2521     table[TTileMode][R8G8B8_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
2522     table[TTileMode][R8G8B8_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
2523     table[TTileMode][R8G8B8_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
2524     table[TTileMode][R16G16B16_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
2525     table[TTileMode][R16G16B16_UNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
2526     table[TTileMode][R16G16B16_SNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
2527     table[TTileMode][R16G16B16_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2528     table[TTileMode][R16G16B16_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2529     table[TTileMode][R8G8B8_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2530     table[TTileMode][R16G16B16_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2531     table[TTileMode][R16G16B16_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2532     table[TTileMode][R10G10B10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2533     table[TTileMode][R10G10B10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2534     table[TTileMode][R10G10B10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2535     table[TTileMode][R10G10B10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2536     table[TTileMode][B10G10R10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2537     table[TTileMode][B10G10R10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2538     table[TTileMode][B10G10R10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2539     table[TTileMode][B10G10R10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2540     table[TTileMode][B10G10R10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2541     table[TTileMode][R8G8B8_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2542     table[TTileMode][R8G8B8_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2543 }
2544 
2545 //////////////////////////////////////////////////////////////////////////
2546 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2547 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2548 void InitStoreTilesTableDepth(
2549     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2550 {
2551    table[TTileMode][R32_FLOAT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2552    table[TTileMode][R32_FLOAT_X8X24_TYPELESS]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2553    table[TTileMode][R24_UNORM_X8_TYPELESS]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2554    table[TTileMode][R16_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2555 }
2556 
2557 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2558 void InitStoreTilesTableStencil(
2559     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2560 {
2561     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2562 }
2563