• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.h
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 *        operations.
27 *
28 ******************************************************************************/
29 #pragma once
30 
31 #include "common/os.h"
32 #include "core/context.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
35 
36 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
37 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
38 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
39 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
40 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
41 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
42 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
43 void InitClearTilesTable();
44 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
45 void InitBackendFuncTables();
46 void InitCPSFuncTables();
47 void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
48 
/// @brief Identifies a backend function variant.
/// SWR_BACKEND_FUNCS_MAX equals the number of variants listed before it.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE    = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
    SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
    SWR_BACKEND_FUNCS_MAX        = 3,
};
56 
#if (KNOB_SIMD_WIDTH == 8)
// 8-wide float offset vectors; declared here, defined elsewhere.
extern const __m256 vCenterOffsetsX;
extern const __m256 vCenterOffsetsY;
extern const __m256 vULOffsetsX;
extern const __m256 vULOffsetsY;

// Keeps the low 8 bits of a coverage value (one bit per lane of an 8-wide SIMD).
#define MASK 0xff
#endif // KNOB_SIMD_WIDTH == 8
64 
RasterTileColorOffset(uint32_t sampleNum)65 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
66 {
67     static const uint32_t RasterTileColorOffsets[16]
68     { 0,
69       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
70       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
71       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
72       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
73       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
74       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
75       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
76       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
77       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
78       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
79       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
80       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
81       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
82       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
83       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
84     };
85     assert(sampleNum < 16);
86     return RasterTileColorOffsets[sampleNum];
87 }
88 
RasterTileDepthOffset(uint32_t sampleNum)89 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
90 {
91     static const uint32_t RasterTileDepthOffsets[16]
92     { 0,
93       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
94       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
95       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
96       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
97       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
98       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
99       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
100       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
101       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
102       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
103       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
104       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
105       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
106       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
107       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
108     };
109     assert(sampleNum < 16);
110     return RasterTileDepthOffsets[sampleNum];
111 }
112 
RasterTileStencilOffset(uint32_t sampleNum)113 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
114 {
115     static const uint32_t RasterTileStencilOffsets[16]
116     { 0,
117       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
118       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
119       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
120       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
121       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
122       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
123       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
124       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
125       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
126       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
127       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
128       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
129       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
130       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
131       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
132     };
133     assert(sampleNum < 16);
134     return RasterTileStencilOffsets[sampleNum];
135 }
136 
137 template<typename T, uint32_t InputCoverage>
138 struct generateInputCoverage
139 {
generateInputCoveragegenerateInputCoverage140     INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
141     {
142         // will need to update for avx512
143         assert(KNOB_SIMD_WIDTH == 8);
144 
145         __m256i mask[2];
146         __m256i sampleCoverage[2];
147         if(T::bIsStandardPattern)
148         {
149             __m256i src = _mm256_set1_epi32(0);
150             __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
151 
152             if(T::MultisampleT::numSamples == 1)
153             {
154                 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
155             }
156             else if(T::MultisampleT::numSamples == 2)
157             {
158                 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
159             }
160             else if(T::MultisampleT::numSamples == 4)
161             {
162                 mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
163             }
164             else if(T::MultisampleT::numSamples == 8)
165             {
166                 mask[0] = _mm256_set1_epi32(-1);
167             }
168             else if(T::MultisampleT::numSamples == 16)
169             {
170                 mask[0] = _mm256_set1_epi32(-1);
171                 mask[1] = _mm256_set1_epi32(-1);
172                 index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
173             }
174 
175             // gather coverage for samples 0-7
176             sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
177             if(T::MultisampleT::numSamples > 8)
178             {
179                 // gather coverage for samples 8-15
180                 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
181             }
182         }
183         else
184         {
185             // center coverage is the same for all samples; just broadcast to the sample slots
186             uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
187             if(T::MultisampleT::numSamples == 1)
188             {
189                 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
190             }
191             else if(T::MultisampleT::numSamples == 2)
192             {
193                 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
194             }
195             else if(T::MultisampleT::numSamples == 4)
196             {
197                 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
198             }
199             else if(T::MultisampleT::numSamples == 8)
200             {
201                 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
202             }
203             else if(T::MultisampleT::numSamples == 16)
204             {
205                 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
206                 sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
207             }
208         }
209 
210         mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
211                                   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
212         // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
213         __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
214 
215         __m256i packedCoverage1;
216         if(T::MultisampleT::numSamples > 8)
217         {
218             // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
219             packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
220         }
221 
222     #if (KNOB_ARCH == KNOB_ARCH_AVX)
223         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
224         __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
225         __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
226         packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
227 
228         __m256i packedSampleCoverage;
229         if(T::MultisampleT::numSamples > 8)
230         {
231             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
232             hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
233             shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
234             shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
235             packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
236             packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
237         }
238         else
239         {
240             packedSampleCoverage = packedCoverage0;
241         }
242     #else
243         __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
244         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
245         packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
246 
247         __m256i packedSampleCoverage;
248         if(T::MultisampleT::numSamples > 8)
249         {
250             permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
251             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
252             packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
253 
254             // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
255             packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
256         }
257         else
258         {
259             packedSampleCoverage = packedCoverage0;
260         }
261     #endif
262 
263         for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
264         {
265             // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
266             inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
267 
268             if(!T::bForcedSampleCount)
269             {
270                 // input coverage has to be anded with sample mask if MSAA isn't forced on
271                 inputMask[i] &= sampleMask;
272             }
273 
274             // shift to the next pixel in the 4x2
275             packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
276         }
277     }
278 
generateInputCoveragegenerateInputCoverage279     INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
280     {
281         uint32_t inputMask[KNOB_SIMD_WIDTH];
282         generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
283         inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
284     }
285 
286 };
287 
288 template<typename T>
289 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
290 {
291     INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
292     {
293         // will need to update for avx512
294         assert(KNOB_SIMD_WIDTH == 8);
295         __m256i vec = _mm256_set1_epi32(coverageMask[0]);
296         const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
297         vec = _simd_and_si(vec, bit);
298         vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
299         vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
300         inputCoverage = _simd_castsi_ps(vec);
301     }
302 
303     INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
304     {
305         uint32_t simdCoverage = (coverageMask[0] & MASK);
306         static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
307         for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
308         {
309             // set all samples to covered if conservative coverage mask is set for that pixel
310             inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
311         }
312     }
313 };
314 
315 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
316 // Centroid behaves exactly as follows :
317 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
318 //     have a sample location there).
319 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
320 //     coverage with the SampleMask Rasterizer State.
321 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
322 //     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
323 //     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
324 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
325 template<typename T>
326 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
327                             const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
328 {
329     uint32_t inputMask[KNOB_SIMD_WIDTH];
330     generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
331 
332     // Case (2) - partially covered pixel
333 
334     // scan for first covered sample per pixel in the 4x2 span
335     unsigned long sampleNum[KNOB_SIMD_WIDTH];
336     (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
337     (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
338     (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
339     (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
340     (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
341     (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
342     (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
343     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
344 
345     // look up and set the sample offsets from UL pixel corner for first covered sample
346     __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
347                                     T::MultisampleT::X(sampleNum[6]),
348                                     T::MultisampleT::X(sampleNum[5]),
349                                     T::MultisampleT::X(sampleNum[4]),
350                                     T::MultisampleT::X(sampleNum[3]),
351                                     T::MultisampleT::X(sampleNum[2]),
352                                     T::MultisampleT::X(sampleNum[1]),
353                                     T::MultisampleT::X(sampleNum[0]));
354 
355     __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
356                                     T::MultisampleT::Y(sampleNum[6]),
357                                     T::MultisampleT::Y(sampleNum[5]),
358                                     T::MultisampleT::Y(sampleNum[4]),
359                                     T::MultisampleT::Y(sampleNum[3]),
360                                     T::MultisampleT::Y(sampleNum[2]),
361                                     T::MultisampleT::Y(sampleNum[1]),
362                                     T::MultisampleT::Y(sampleNum[0]));
363     // add sample offset to UL pixel corner
364     vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
365     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
366 
367     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
368     static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
369     __m256i vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
370     __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
371 
372     static const __m256i vZero = _simd_setzero_si();
373     const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
374     __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
375     __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
376     __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
377 
378     __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
379 
380     // set the centroid position based on results from above
381     psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
382     psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
383 
384     // Case (3a) No samples covered and partial sample mask
385     __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
386     // sample mask should never be all 0's for this case, but handle it anyways
387     unsigned long firstCoveredSampleMaskSample = 0;
388     (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
389 
390     __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
391 
392     vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
393     vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
394 
395     // blend in case 3a pixel locations
396     psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
397     psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
398 }
399 
400 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
401                                      const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
402 {
403     // evaluate I,J
404     psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
405     psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
406     psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
407     psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
408 
409     // interpolate 1/w
410     psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
411 }
412 
413 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
414 {
415     const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
416     const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
417 
418     return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
419 }
420 
421 template<typename T>
422 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
423 {
424     // RT has to be single sample if we're in forcedMSAA mode
425     if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
426     {
427         return 1;
428     }
429     // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
430     else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
431     {
432         return GetNumSamples(blendSampleCount);
433     }
434     // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
435     else
436     {
437         return T::MultisampleT::numSamples;
438     }
439 }
440 
441 inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
442 {
443     // broadcast scalars
444 
445     coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
446     coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
447     coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
448 
449     coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
450     coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
451     coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
452 
453     coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
454     coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
455     coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
456 
457     coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
458 
459     coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
460     coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
461     coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
462 }
463 
464 inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
465 {
466     assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
467 
468     if (pColorBuffer)
469     {
470         for (uint32_t index = 0; index < colorBufferCount; index += 1)
471         {
472             pColorBuffer[index] = renderBuffers.pColor[index];
473         }
474     }
475 
476     if (pDepthBuffer)
477     {
478         *pDepthBuffer = renderBuffers.pDepth;
479     }
480 
481     if (pStencilBuffer)
482     {
483         *pStencilBuffer = renderBuffers.pStencil;;
484     }
485 }
486 
487 template<typename T>
488 void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_TRIANGLE_DESC &work)
489 {
490     psContext->pAttribs = work.pAttribs;
491     psContext->pPerspAttribs = work.pPerspAttribs;
492     psContext->frontFace = work.triFlags.frontFacing;
493     psContext->primID = work.triFlags.primID;
494 
495     // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
496     psContext->I = work.I;
497     psContext->J = work.J;
498 
499     psContext->recipDet = work.recipDet;
500     psContext->pRecipW = work.pRecipW;
501     psContext->pSamplePosX = reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
502     psContext->pSamplePosY = reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
503     psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
504     psContext->sampleIndex = 0;
505 }
506 
507 template<typename T, bool IsSingleSample>
508 void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
509 {
510     if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
511     {
512         // for 1x case, centroid is pixel center
513         psContext->vX.centroid = psContext->vX.center;
514         psContext->vY.centroid = psContext->vY.center;
515         psContext->vI.centroid = psContext->vI.center;
516         psContext->vJ.centroid = psContext->vJ.center;
517         psContext->vOneOverW.centroid = psContext->vOneOverW.center;
518     }
519     else
520     {
521         if (T::bCentroidPos)
522         {
523             ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
524             if (T::bIsStandardPattern)
525             {
526                 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
527                 CalcCentroidPos<T>(*psContext, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
528             }
529             else
530             {
531                 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
532                 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
533             }
534 
535             CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
536         }
537         else
538         {
539             psContext->vX.centroid = psContext->vX.sample;
540             psContext->vY.centroid = psContext->vY.sample;
541         }
542     }
543 }
544 
545 template<typename T>
546 struct PixelRateZTestLoop
547 {
548     PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
549                        uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
550                        pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
551                        clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer) {};
552 
553     INLINE
554     uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
555                         const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
556     {
557         SWR_CONTEXT *pContext = pDC->pContext;
558 
559         uint32_t statCount = 0;
560         simdscalar anyDepthSamplePassed = _simd_setzero_ps();
561         for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
562         {
563             const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
564             vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
565 
566             if(!_simd_movemask_ps(vCoverageMask[sample]))
567             {
568                 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
569                 continue;
570             }
571 
572             // offset depth/stencil buffers current sample
573             uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
574             uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
575 
576             if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
577             {
578                 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
579 
580                 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
581 
582                 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
583                 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
584 
585                 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
586             }
587 
588             AR_BEGIN(BEBarycentric, pDC->drawId);
589 
590             // calculate per sample positions
591             psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
592             psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
593 
594             // calc I & J per sample
595             CalcSampleBarycentrics(coeffs, psContext);
596 
597             if(psState.writesODepth)
598             {
599                 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
600                 vZ[sample] = psContext.vZ;
601             }
602             else
603             {
604                 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
605                 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
606             }
607 
608             AR_END(BEBarycentric, 0);
609 
610             ///@todo: perspective correct vs non-perspective correct clipping?
611             // if clip distances are enabled, we need to interpolate for each sample
612             if(clipDistanceMask)
613             {
614                 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
615 
616                 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
617             }
618 
619             // ZTest for this sample
620             ///@todo Need to uncomment out this bucket.
621             //AR_BEGIN(BEDepthBucket, pDC->drawId);
622             depthPassMask[sample] = vCoverageMask[sample];
623             stencilPassMask[sample] = vCoverageMask[sample];
624             depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
625                                                      vZ[sample], pDepthSample, vCoverageMask[sample],
626                                                      pStencilSample, &stencilPassMask[sample]);
627             //AR_END(BEDepthBucket, 0);
628 
629             // early-exit if no pixels passed depth or earlyZ is forced on
630             if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
631             {
632                 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
633                                   pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
634 
635                 if(!_simd_movemask_ps(depthPassMask[sample]))
636                 {
637                     continue;
638                 }
639             }
640             anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
641             uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
642             statCount += _mm_popcnt_u32(statMask);
643         }
644 
645         activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
646         // return number of samples that passed depth and coverage
647         return statCount;
648     }
649 
650     // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
651     simdscalar vZ[T::MultisampleT::numCoverageSamples];
652     simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
653     simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
654     simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
655 
656 private:
657     // functor inputs
658     DRAW_CONTEXT* pDC;
659     uint32_t workerId;
660 
661     const SWR_TRIANGLE_DESC& work;
662     const BarycentricCoeffs& coeffs;
663     const API_STATE& state;
664     const SWR_PS_STATE& psState;
665     const uint8_t clipDistanceMask;
666     uint8_t*& pDepthBuffer;
667     uint8_t*& pStencilBuffer;
668 };
669 
670 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
671 {
672     // evaluate I,J
673     psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
674     psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
675     psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
676     psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
677 
678     // interpolate 1/w
679     psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
680 }
681 
682 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
683 {
684     // evaluate I,J
685     psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
686     psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
687     psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
688     psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
689 
690     // interpolate 1/w
691     psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
692 }
693 
694 // Merge Output to 4x2 SIMD Tile Format
695 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
696     const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
697 {
698     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
699     const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
700     simdvector blendOut;
701 
702     for(uint32_t rt = 0; rt < NumRT; ++rt)
703     {
704         uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
705 
706         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
707         // pfnBlendFunc may not update all channels.  Initialize with PS output.
708         /// TODO: move this into the blend JIT.
709         blendOut = psContext.shaded[rt];
710 
711         // Blend outputs and update coverage mask for alpha test
712         if(pfnBlendFunc[rt] != nullptr)
713         {
714             pfnBlendFunc[rt](
715                 pBlendState,
716                 psContext.shaded[rt],
717                 psContext.shaded[1],
718                 psContext.shaded[0].w,
719                 sample,
720                 pColorSample,
721                 blendOut,
722                 &psContext.oMask,
723                 (simdscalari*)&coverageMask);
724         }
725 
726         // final write mask
727         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
728 
729         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
730         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
731 
732         const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
733 
734         // store with color mask
735         if(!pRTBlend->writeDisableRed)
736         {
737             _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
738         }
739         if(!pRTBlend->writeDisableGreen)
740         {
741             _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
742         }
743         if(!pRTBlend->writeDisableBlue)
744         {
745             _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
746         }
747         if(!pRTBlend->writeDisableAlpha)
748         {
749             _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
750         }
751     }
752 }
753 
754 #if USE_8x2_TILE_BACKEND
755 // Merge Output to 8x2 SIMD16 Tile Format
756 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
757     const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
758 {
759     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
760     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
761 
762     if (useAlternateOffset)
763     {
764         rasterTileColorOffset += sizeof(simdscalar);
765     }
766 
767     simdvector blendSrc;
768     simdvector blendOut;
769 
770     uint32_t colorBufferBit = 1;
771     for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
772     {
773         simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
774 
775         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
776         // pfnBlendFunc may not update all channels.  Initialize with PS output.
777         /// TODO: move this into the blend JIT.
778         blendOut = psContext.shaded[rt];
779 
780         if (colorBufferBit & colorBufferEnableMask)
781         {
782             blendSrc[0] = pColorSample[0];
783             blendSrc[1] = pColorSample[2];
784             blendSrc[2] = pColorSample[4];
785             blendSrc[3] = pColorSample[6];
786         }
787 
788         // Blend outputs and update coverage mask for alpha test
789         if (pfnBlendFunc[rt] != nullptr)
790         {
791             pfnBlendFunc[rt](
792                 pBlendState,
793                 psContext.shaded[rt],
794                 psContext.shaded[1],
795                 psContext.shaded[0].w,
796                 sample,
797                 reinterpret_cast<uint8_t *>(&blendSrc),
798                 blendOut,
799                 &psContext.oMask,
800                 reinterpret_cast<simdscalari *>(&coverageMask));
801         }
802 
803         // final write mask
804         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
805 
806         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
807         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
808 
809         // store with color mask
810         if (!pRTBlend->writeDisableRed)
811         {
812             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
813         }
814         if (!pRTBlend->writeDisableGreen)
815         {
816             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
817         }
818         if (!pRTBlend->writeDisableBlue)
819         {
820             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
821         }
822         if (!pRTBlend->writeDisableAlpha)
823         {
824             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
825         }
826     }
827 }
828 
829 #endif
830 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
831          uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
832 struct SwrBackendTraits
833 {
834     static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
835     static const uint32_t InputCoverage = coverage;
836     static const bool bCentroidPos = (centroid == 1);
837     static const bool bForcedSampleCount = (forced == 1);
838     static const bool bCanEarlyZ = (canEarlyZ == 1);
839     typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT;
840 };
841