• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 *        operations.
27 *
28 ******************************************************************************/
29 
30 #include <smmintrin.h>
31 
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 
38 #include <algorithm>
39 
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42 
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessComputeBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t threadGroupId,void * & pSpillFillBuffer)48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50     SWR_CONTEXT *pContext = pDC->pContext;
51 
52     AR_BEGIN(BEDispatch, pDC->drawId);
53 
54     const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55     SWR_ASSERT(pTaskData != nullptr);
56 
57     // Ensure spill fill memory has been allocated.
58     size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59     if (spillFillSize && pSpillFillBuffer == nullptr)
60     {
61         pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62     }
63 
64     const API_STATE& state = GetApiState(pDC);
65 
66     SWR_CS_CONTEXT csContext{ 0 };
67     csContext.tileCounter = threadGroupId;
68     csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69     csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71     csContext.pTGSM = pContext->ppScratch[workerId];
72     csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73 
74     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75 
76     UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
77 
78     AR_END(BEDispatch, 1);
79 }
80 
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessShutdownBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)86 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
87 {
88     // Dummy function
89 }
90 
ProcessSyncBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)91 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
92 {
93     uint32_t x, y;
94     MacroTileMgr::getTileIndices(macroTile, x, y);
95     SWR_ASSERT(x == 0 && y == 0);
96 }
97 
98 template<SWR_FORMAT format>
ClearRasterTile(uint8_t * pTileBuffer,simdvector & value)99 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
100 {
101     auto lambda = [&](int32_t comp)
102     {
103         FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
104 
105         pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
106     };
107 
108     const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
109 
110     for (uint32_t i = 0; i < numIter; ++i)
111     {
112         UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
113     }
114 }
115 
116 #if USE_8x2_TILE_BACKEND
117 template<SWR_FORMAT format>
ClearRasterTile(uint8_t * pTileBuffer,simd16vector & value)118 void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
119 {
120     auto lambda = [&](int32_t comp)
121     {
122         FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
123 
124         pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
125     };
126 
127     const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
128 
129     for (uint32_t i = 0; i < numIter; ++i)
130     {
131         UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
132     }
133 }
134 
135 #endif
136 template<SWR_FORMAT format>
ClearMacroTile(DRAW_CONTEXT * pDC,SWR_RENDERTARGET_ATTACHMENT rt,uint32_t macroTile,uint32_t renderTargetArrayIndex,DWORD clear[4],const SWR_RECT & rect)137 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
138 {
139     // convert clear color to hottile format
140     // clear color is in RGBA float/uint32
141 #if USE_8x2_TILE_BACKEND
142     simd16vector vClear;
143     for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
144     {
145         simd16scalar vComp;
146         vComp = _simd16_load1_ps((const float*)&clear[comp]);
147         if (FormatTraits<format>::isNormalized(comp))
148         {
149             vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
150             vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
151         }
152         vComp = FormatTraits<format>::pack(comp, vComp);
153         vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
154     }
155 
156 #else
157     simdvector vClear;
158     for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
159     {
160         simdscalar vComp;
161         vComp = _simd_load1_ps((const float*)&clear[comp]);
162         if (FormatTraits<format>::isNormalized(comp))
163         {
164             vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
165             vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
166         }
167         vComp = FormatTraits<format>::pack(comp, vComp);
168         vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
169     }
170 
171 #endif
172     uint32_t tileX, tileY;
173     MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
174 
175     // Init to full macrotile
176     SWR_RECT clearTile =
177     {
178         KNOB_MACROTILE_X_DIM * int32_t(tileX),
179         KNOB_MACROTILE_Y_DIM * int32_t(tileY),
180         KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
181         KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
182     };
183 
184     // intersect with clear rect
185     clearTile &= rect;
186 
187     // translate to local hottile origin
188     clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
189 
190     // Make maximums inclusive (needed for convert to raster tiles)
191     clearTile.xmax -= 1;
192     clearTile.ymax -= 1;
193 
194     // convert to raster tiles
195     clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
196     clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
197     clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
198     clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
199 
200     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
201     // compute steps between raster tile samples / raster tiles / macro tile rows
202     const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
203     const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
204     const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
205     const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
206 
207     HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
208     uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
209     uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
210 
211     // loop over all raster tiles in the current hot tile
212     for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
213     {
214         uint8_t* pRasterTile = pRasterTileRow;
215         for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
216         {
217             for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
218             {
219                 ClearRasterTile<format>(pRasterTile, vClear);
220                 pRasterTile += rasterTileSampleStep;
221             }
222         }
223         pRasterTileRow += macroTileRowStep;
224     }
225 
226     pHotTile->state = HOTTILE_DIRTY;
227 }
228 
229 
ProcessClearBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)230 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
231 {
232     SWR_CONTEXT *pContext = pDC->pContext;
233 
234     if (KNOB_FAST_CLEAR)
235     {
236         CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
237         SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
238         uint32_t numSamples = GetNumSamples(sampleCount);
239 
240         SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
241 
242         AR_BEGIN(BEClear, pDC->drawId);
243 
244         if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
245         {
246             unsigned long rt = 0;
247             uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
248             while (_BitScanForward(&rt, mask))
249             {
250                 mask &= ~(1 << rt);
251 
252                 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
253 
254                 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
255                 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
256                 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
257                 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
258                 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
259                 pHotTile->state = HOTTILE_CLEAR;
260             }
261         }
262 
263         if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
264         {
265             HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
266             pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
267             pHotTile->state = HOTTILE_CLEAR;
268         }
269 
270         if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
271         {
272             HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
273 
274             pHotTile->clearData[0] = pClear->clearStencil;
275             pHotTile->state = HOTTILE_CLEAR;
276         }
277 
278         AR_END(BEClear, 1);
279     }
280     else
281     {
282         // Legacy clear
283         CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
284         AR_BEGIN(BEClear, pDC->drawId);
285 
286         if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
287         {
288             DWORD clearData[4];
289             clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
290             clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
291             clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
292             clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
293 
294             PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
295             SWR_ASSERT(pfnClearTiles != nullptr);
296 
297             unsigned long rt = 0;
298             uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
299             while (_BitScanForward(&rt, mask))
300             {
301                 mask &= ~(1 << rt);
302 
303                 pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
304             }
305         }
306 
307         if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
308         {
309             DWORD clearData[4];
310             clearData[0] = *(DWORD*)&pClear->clearDepth;
311             PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
312             SWR_ASSERT(pfnClearTiles != nullptr);
313 
314             pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
315         }
316 
317         if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
318         {
319             DWORD clearData[4];
320             clearData[0] = pClear->clearStencil;
321             PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
322 
323             pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
324         }
325 
326         AR_END(BEClear, 1);
327     }
328 }
329 
ProcessStoreTileBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,STORE_TILES_DESC * pDesc,SWR_RENDERTARGET_ATTACHMENT attachment)330 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
331     SWR_RENDERTARGET_ATTACHMENT attachment)
332 {
333     SWR_CONTEXT *pContext = pDC->pContext;
334 
335     AR_BEGIN(BEStoreTiles, pDC->drawId);
336 
337     SWR_FORMAT srcFormat;
338     switch (attachment)
339     {
340     case SWR_ATTACHMENT_COLOR0:
341     case SWR_ATTACHMENT_COLOR1:
342     case SWR_ATTACHMENT_COLOR2:
343     case SWR_ATTACHMENT_COLOR3:
344     case SWR_ATTACHMENT_COLOR4:
345     case SWR_ATTACHMENT_COLOR5:
346     case SWR_ATTACHMENT_COLOR6:
347     case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
348     case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
349     case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
350     default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
351     }
352 
353     uint32_t x, y;
354     MacroTileMgr::getTileIndices(macroTile, x, y);
355 
356     // Only need to store the hottile if it's been rendered to...
357     HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
358     if (pHotTile)
359     {
360         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
361         if (pHotTile->state == HOTTILE_CLEAR)
362         {
363             PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
364             SWR_ASSERT(pfnClearTiles != nullptr);
365 
366             pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
367         }
368 
369         if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
370         {
371             int32_t destX = KNOB_MACROTILE_X_DIM * x;
372             int32_t destY = KNOB_MACROTILE_Y_DIM * y;
373 
374             pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
375                 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
376         }
377 
378 
379         if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
380         {
381             pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
382         }
383     }
384     AR_END(BEStoreTiles, 1);
385 }
386 
ProcessStoreTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)387 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
388 {
389     STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
390 
391     unsigned long rt = 0;
392     uint32_t mask = pDesc->attachmentMask;
393     while (_BitScanForward(&rt, mask))
394     {
395         mask &= ~(1 << rt);
396         ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
397     }
398 }
399 
ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)400 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
401 {
402     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
403     SWR_CONTEXT *pContext = pDC->pContext;
404 
405     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
406 
407     for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
408     {
409         if (pDesc->attachmentMask & (1 << i))
410         {
411             HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
412                 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
413             if (pHotTile)
414             {
415                 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
416             }
417         }
418     }
419 }
420 
421 #if KNOB_SIMD_WIDTH == 8
422 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
423 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
424 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
425 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
426 #else
427 #error Unsupported vector width
428 #endif
429 
ComputeUserClipMask(uint8_t clipMask,float * pUserClipBuffer,simdscalar vI,simdscalar vJ)430 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
431 {
432     simdscalar vClipMask = _simd_setzero_ps();
433     uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
434 
435     for (uint32_t i = 0; i < numClipDistance; ++i)
436     {
437         // pull triangle clip distance values from clip buffer
438         simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
439         simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
440         simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
441 
442         // interpolate
443         simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
444 
445         // clip if interpolated clip distance is < 0 || NAN
446         simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
447 
448         vClipMask = _simd_or_ps(vClipMask, vCull);
449     }
450 
451     return _simd_movemask_ps(vClipMask);
452 }
453 
//////////////////////////////////////////////////////////////////////////
/// @brief Pixel backend for one raster tile, shading once per pixel
///        (sample index 0).
///
/// Walks the raster tile one SIMD tile at a time. For every SIMD tile with
/// coverage it optionally applies the depth bounds test, generates input
/// coverage, computes barycentrics and interpolated Z, applies user clip
/// distances, runs depth/stencil either before or after the pixel shader
/// (chosen at compile time by T::bCanEarlyZ), and calls the output merger.
/// The buffer pointers and the coverage mask(s) in `work` are advanced after
/// every SIMD tile, whether or not it was shaded.
///
/// @tparam T - backend traits; this function reads T::InputCoverage and
///             T::bCanEarlyZ and forwards T to helper templates.
/// @param pDC - pointer to draw context.
/// @param workerId - worker thread id. Not referenced directly in the body;
///                   presumably consumed by the AR_* / UPDATE_STAT_BE macros -- TODO confirm.
/// @param x - x coordinate of the raster tile origin (added to the per-lane offsets).
/// @param y - y coordinate of the raster tile origin (added to the per-lane offsets).
/// @param work - triangle descriptor. Modified: coverageMask[0] (and
///               innerCoverageMask for inner-conservative input coverage) is
///               shifted right as SIMD tiles are consumed.
/// @param renderBuffers - source of the color/depth/stencil pointers obtained
///                        through SetupRenderBuffers.
454 template<typename T>
BackendSingleSample(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)455 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
456 {
457     SWR_CONTEXT *pContext = pDC->pContext;
458 
459     AR_BEGIN(BESingleSampleBackend, pDC->drawId);
460     AR_BEGIN(BESetup, pDC->drawId);
461 
462     const API_STATE &state = GetApiState(pDC);
463 
464     BarycentricCoeffs coeffs;
465     SetupBarycentricCoeffs(&coeffs, work);
466 
467     uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
468     SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
469 
470     SWR_PS_CONTEXT psContext;
471     SetupPixelShaderContext<T>(&psContext, work);
472 
473     AR_END(BESetup, 1);
474 
    // Per-lane Y positions for the first row of SIMD tiles; stepped by dy at
    // the bottom of the outer loop.
475     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
476     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
477 
478     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
479 
480     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
481     {
        // X positions restart at the raster tile's left edge for every row.
482         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
483         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
484 
485         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
486 
487         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
488         {
489 #if USE_8x2_TILE_BACKEND
            // True for every second SIMD tile along x (when the SIMD_TILE_X_DIM bit of xx is set).
490             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
491 
492 #endif
            // Coverage bits for the current SIMD tile sit in the low bits of
            // coverageMask[0]; they are shifted out at Endtile below.
            // NOTE(review): MASK is defined outside this view.
493             simdmask coverageMask = work.coverageMask[0] & MASK;
494 
495             if (coverageMask)
496             {
                // Optional depth bounds test against the depth values already
                // stored in the depth buffer.
497                 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
498                 {
499                     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
500 
501                     const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
502 
503                     const float minz = state.depthBoundsState.depthBoundsTestMinValue;
504                     const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
505 
506                     coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
507                 }
508 
509                 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
510                 {
                    // Inner-conservative input coverage reads the inner coverage
                    // mask; any other mode reads the regular coverage mask.
511                     const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
512 
513                     generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
514                 }
515 
516                 AR_BEGIN(BEBarycentric, pDC->drawId);
517 
518                 CalcPixelBarycentrics(coeffs, psContext);
519 
520                 CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
521 
522                 // interpolate and quantize z
523                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
524                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
525 
526                 AR_END(BEBarycentric, 1);
527 
528                 // interpolate user clip distance if available
529                 if (state.rastState.clipDistanceMask)
530                 {
531                     coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
532                 }
533 
                // Expand the bit mask to a per-lane vector mask; depth and
                // stencil pass masks start out equal to coverage.
534                 simdscalar vCoverageMask = vMask(coverageMask);
535                 simdscalar depthPassMask = vCoverageMask;
536                 simdscalar stencilPassMask = vCoverageMask;
537 
538                 // Early-Z?
539                 if (T::bCanEarlyZ)
540                 {
541                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
542                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
543                                                      psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
544                     AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
545                     AR_END(BEEarlyDepthTest, 0);
546 
547                     // early-exit if no pixels passed depth or earlyZ is forced on
548                     if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
549                     {
                        // The write happens here both when early Z is forced and
                        // when every lane failed depth; only the latter skips shading.
550                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
551                             pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
552 
553                         if (!_simd_movemask_ps(depthPassMask))
554                         {
555                             goto Endtile;
556                         }
557                     }
558                 }
559 
560                 psContext.sampleIndex = 0;
561                 psContext.activeMask = _simd_castps_si(vCoverageMask);
562 
563                 // execute pixel shader
564                 AR_BEGIN(BEPixelShader, pDC->drawId);
565                 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
566                 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
567                 AR_END(BEPixelShader, 0);
568 
                // Coverage is re-read from activeMask after the shader call
                // (presumably so shader-side pixel kills take effect -- TODO confirm).
569                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
570 
571                 // late-Z
572                 if (!T::bCanEarlyZ)
573                 {
574                     AR_BEGIN(BELateDepthTest, pDC->drawId);
575                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
576                                                         psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
577                     AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
578                     AR_END(BELateDepthTest, 0);
579 
580                     if (!_simd_movemask_ps(depthPassMask))
581                     {
582                         // need to call depth/stencil write for stencil write
583                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
584                             pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
585                         goto Endtile;
586                     }
587                 }
588 
589                 uint32_t statMask = _simd_movemask_ps(depthPassMask);
590                 uint32_t statCount = _mm_popcnt_u32(statMask);
591                 UPDATE_STAT_BE(DepthPassCount, statCount);
592 
593                 // output merger
594                 AR_BEGIN(BEOutputMerger, pDC->drawId);
595 #if USE_8x2_TILE_BACKEND
596                 OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
597 #else
598                 OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
599 #endif
600 
601                 // do final depth write after all pixel kills
602                 if (!state.psState.forceEarlyZ)
603                 {
604                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
605                         pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
606                 }
607                 AR_END(BEOutputMerger, 0);
608             }
609 
            // Reached for every SIMD tile, shaded or not: consume this tile's
            // coverage bits and advance the buffer pointers.
610 Endtile:
611             AR_BEGIN(BEEndTile, pDC->drawId);
612 
613             work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
614             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
615             {
616                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
617             }
618 
619 #if USE_8x2_TILE_BACKEND
            // In the 8x2 path the color pointers move only after every second
            // SIMD tile, by two SIMD tiles' worth of bytes.
620             if (useAlternateOffset)
621             {
622                 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
623                 {
624                     pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
625                 }
626             }
627 #else
628             for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
629             {
630                 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
631             }
632 #endif
633             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
634             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
635 
636             AR_END(BEEndTile, 0);
637 
638             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
639             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
640         }
641 
642         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
643         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
644     }
645 
646     AR_END(BESingleSampleBackend, 0);
647 }
648 
649 template<typename T>
BackendSampleRate(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)650 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
651 {
652     SWR_CONTEXT *pContext = pDC->pContext;
653 
654     AR_BEGIN(BESampleRateBackend, pDC->drawId);
655     AR_BEGIN(BESetup, pDC->drawId);
656 
657     const API_STATE &state = GetApiState(pDC);
658 
659     BarycentricCoeffs coeffs;
660     SetupBarycentricCoeffs(&coeffs, work);
661 
662     uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
663     SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
664 
665     SWR_PS_CONTEXT psContext;
666     SetupPixelShaderContext<T>(&psContext, work);
667 
668     AR_END(BESetup, 0);
669 
670     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
671     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
672 
673     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
674 
675     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
676     {
677         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
678         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
679 
680         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
681 
682         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
683         {
684 #if USE_8x2_TILE_BACKEND
685             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
686 
687 #endif
688             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
689             {
690                 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
691 
692                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
693             }
694 
695             AR_BEGIN(BEBarycentric, pDC->drawId);
696 
697             CalcPixelBarycentrics(coeffs, psContext);
698 
699             CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
700 
701             AR_END(BEBarycentric, 0);
702 
703             for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
704             {
705                 simdmask coverageMask = work.coverageMask[sample] & MASK;
706 
707                 if (coverageMask)
708                 {
709                     // offset depth/stencil buffers current sample
710                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
711                     uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
712 
713                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
714                     {
715                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
716 
717                         const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
718 
719                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
720                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
721 
722                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
723                     }
724 
725                     AR_BEGIN(BEBarycentric, pDC->drawId);
726 
727                     // calculate per sample positions
728                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
729                     psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
730 
731                     CalcSampleBarycentrics(coeffs, psContext);
732 
733                     // interpolate and quantize z
734                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
735                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
736 
737                     AR_END(BEBarycentric, 0);
738 
739                     // interpolate user clip distance if available
740                     if (state.rastState.clipDistanceMask)
741                     {
742                         coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
743                     }
744 
745                     simdscalar vCoverageMask = vMask(coverageMask);
746                     simdscalar depthPassMask = vCoverageMask;
747                     simdscalar stencilPassMask = vCoverageMask;
748 
749                     // Early-Z?
750                     if (T::bCanEarlyZ)
751                     {
752                         AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
753                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
754                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
755                         AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
756                         AR_END(BEEarlyDepthTest, 0);
757 
758                         // early-exit if no samples passed depth or earlyZ is forced on.
759                         if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
760                         {
761                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
762                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
763 
764                             if (!_simd_movemask_ps(depthPassMask))
765                             {
766                                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
767                                 continue;
768                             }
769                         }
770                     }
771 
772                     psContext.sampleIndex = sample;
773                     psContext.activeMask = _simd_castps_si(vCoverageMask);
774 
775                     // execute pixel shader
776                     AR_BEGIN(BEPixelShader, pDC->drawId);
777                     UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
778                     state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
779                     AR_END(BEPixelShader, 0);
780 
781                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
782 
783                     // late-Z
784                     if (!T::bCanEarlyZ)
785                     {
786                         AR_BEGIN(BELateDepthTest, pDC->drawId);
787                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
788                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
789                         AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
790                         AR_END(BELateDepthTest, 0);
791 
792                         if (!_simd_movemask_ps(depthPassMask))
793                         {
794                             // need to call depth/stencil write for stencil write
795                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
796                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
797 
798                             work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
799                             continue;
800                         }
801                     }
802 
803                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
804                     uint32_t statCount = _mm_popcnt_u32(statMask);
805                     UPDATE_STAT_BE(DepthPassCount, statCount);
806 
807                     // output merger
808                     AR_BEGIN(BEOutputMerger, pDC->drawId);
809 #if USE_8x2_TILE_BACKEND
810                     OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
811 #else
812                     OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
813 #endif
814 
815                     // do final depth write after all pixel kills
816                     if (!state.psState.forceEarlyZ)
817                     {
818                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
819                             pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
820                     }
821                     AR_END(BEOutputMerger, 0);
822                 }
823                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
824             }
825 
826 Endtile:
827             ATTR_UNUSED;
828 
829             AR_BEGIN(BEEndTile, pDC->drawId);
830 
831             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
832             {
833                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
834             }
835 
836 #if USE_8x2_TILE_BACKEND
837             if (useAlternateOffset)
838             {
839                 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
840                 {
841                     pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
842                 }
843             }
844 #else
845             for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
846             {
847                 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
848             }
849 #endif
850             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
851             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
852 
853             AR_END(BEEndTile, 0);
854 
855             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
856             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
857         }
858 
859         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
860         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
861     }
862 
863     AR_END(BESampleRateBackend, 0);
864 }
865 
866 template<typename T>
BackendPixelRate(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)867 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
868 {
869     SWR_CONTEXT *pContext = pDC->pContext;
870 
871     AR_BEGIN(BEPixelRateBackend, pDC->drawId);
872     AR_BEGIN(BESetup, pDC->drawId);
873 
874     const API_STATE &state = GetApiState(pDC);
875 
876     BarycentricCoeffs coeffs;
877     SetupBarycentricCoeffs(&coeffs, work);
878 
879     uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
880     SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
881 
882     SWR_PS_CONTEXT psContext;
883     SetupPixelShaderContext<T>(&psContext, work);
884 
885     AR_END(BESetup, 0);
886 
887     PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
888 
889     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
890     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
891 
892     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
893 
894     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
895     {
896         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
897         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
898 
899         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
900 
901         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
902         {
903 #if USE_8x2_TILE_BACKEND
904             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
905 
906 #endif
907             simdscalar activeLanes;
908             if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
909             activeLanes = vMask(work.anyCoveredSamples & MASK);
910 
911             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
912             {
913                 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
914 
915                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
916             }
917 
918             AR_BEGIN(BEBarycentric, pDC->drawId);
919 
920             CalcPixelBarycentrics(coeffs, psContext);
921 
922             CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
923 
924             AR_END(BEBarycentric, 0);
925 
926             if(T::bForcedSampleCount)
927             {
928                 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
929                 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
930                 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
931             }
932 
933             // Early-Z?
934             if(T::bCanEarlyZ && !T::bForcedSampleCount)
935             {
936                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
937                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
938                 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
939             }
940 
941             // if we have no covered samples that passed depth at this point, go to next tile
942             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
943 
944             if(state.psState.usesSourceDepth)
945             {
946                 AR_BEGIN(BEBarycentric, pDC->drawId);
947                 // interpolate and quantize z
948                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
949                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
950                 AR_END(BEBarycentric, 0);
951             }
952 
953             // pixels that are currently active
954             psContext.activeMask = _simd_castps_si(activeLanes);
955             psContext.oMask = T::MultisampleT::FullSampleMask();
956 
957             // execute pixel shader
958             AR_BEGIN(BEPixelShader, pDC->drawId);
959             state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
960             UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
961             AR_END(BEPixelShader, 0);
962 
963             // update active lanes to remove any discarded or oMask'd pixels
964             activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
965             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
966 
967             // late-Z
968             if(!T::bCanEarlyZ && !T::bForcedSampleCount)
969             {
970                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
971                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
972                 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
973             }
974 
975             // if we have no covered samples that passed depth at this point, skip OM and go to next tile
976             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
977 
978             // output merger
979             // loop over all samples, broadcasting the results of the PS to all passing pixels
980             for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
981             {
982                 AR_BEGIN(BEOutputMerger, pDC->drawId);
983                 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
984                 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
985                 simdscalar coverageMask, depthMask;
986                 if(T::bForcedSampleCount)
987                 {
988                     coverageMask = depthMask = activeLanes;
989                 }
990                 else
991                 {
992                     coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
993                     depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
994                     if(!_simd_movemask_ps(depthMask))
995                     {
996                         // stencil should already have been written in early/lateZ tests
997                         AR_END(BEOutputMerger, 0);
998                         continue;
999                     }
1000                 }
1001 
1002                 // broadcast the results of the PS to all passing pixels
1003 #if USE_8x2_TILE_BACKEND
1004                 OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
1005 #else
1006                 OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
1007 #endif
1008 
1009                 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
1010                 {
1011                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1012                     uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1013 
1014                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1015                                       pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1016                 }
1017                 AR_END(BEOutputMerger, 0);
1018             }
1019 Endtile:
1020             AR_BEGIN(BEEndTile, pDC->drawId);
1021 
1022             for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1023             {
1024                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1025             }
1026 
1027             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1028             {
1029                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1030             }
1031             work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1032 
1033 #if USE_8x2_TILE_BACKEND
1034             if (useAlternateOffset)
1035             {
1036                 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1037                 {
1038                     pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1039                 }
1040             }
1041 #else
1042             for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1043             {
1044                 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1045             }
1046             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1047             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1048 #endif
1049 
1050             AR_END(BEEndTile, 0);
1051 
1052             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
1053             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
1054         }
1055 
1056         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
1057         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
1058     }
1059 
1060     AR_END(BEPixelRateBackend, 0);
1061 }
1062 // optimized backend flow with NULL PS
1063 template<uint32_t sampleCountT>
BackendNullPS(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)1064 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1065 {
1066     SWR_CONTEXT *pContext = pDC->pContext;
1067 
1068     AR_BEGIN(BENullBackend, pDC->drawId);
1069     ///@todo: handle center multisample pattern
1070     typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1071     AR_BEGIN(BESetup, pDC->drawId);
1072 
1073     const API_STATE &state = GetApiState(pDC);
1074 
1075     BarycentricCoeffs coeffs;
1076     SetupBarycentricCoeffs(&coeffs, work);
1077 
1078     uint8_t *pDepthBuffer, *pStencilBuffer;
1079     SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
1080 
1081     SWR_PS_CONTEXT psContext;
1082     // skip SetupPixelShaderContext(&psContext, ...); // not needed here
1083 
1084     AR_END(BESetup, 0);
1085 
1086     simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
1087 
1088     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
1089 
1090     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1091     {
1092         simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
1093 
1094         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
1095 
1096         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1097         {
1098             // iterate over active samples
1099             unsigned long sample = 0;
1100             uint32_t sampleMask = state.blendState.sampleMask;
1101             while (_BitScanForward(&sample, sampleMask))
1102             {
1103                 sampleMask &= ~(1 << sample);
1104 
1105                 simdmask coverageMask = work.coverageMask[sample] & MASK;
1106 
1107                 if (coverageMask)
1108                 {
1109                     // offset depth/stencil buffers current sample
1110                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1111                     uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1112 
1113                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
1114                     {
1115                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
1116 
1117                         const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
1118 
1119                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
1120                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
1121 
1122                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
1123                     }
1124 
1125                     AR_BEGIN(BEBarycentric, pDC->drawId);
1126 
1127                     // calculate per sample positions
1128                     psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1129                     psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1130 
1131                     CalcSampleBarycentrics(coeffs, psContext);
1132 
1133                     // interpolate and quantize z
1134                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1135                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1136 
1137                     AR_END(BEBarycentric, 0);
1138 
1139                     // interpolate user clip distance if available
1140                     if (state.rastState.clipDistanceMask)
1141                     {
1142                         coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
1143                     }
1144 
1145                     simdscalar vCoverageMask = vMask(coverageMask);
1146                     simdscalar stencilPassMask = vCoverageMask;
1147 
1148                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
1149                     simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
1150                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1151                     AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
1152                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1153                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1154                     AR_END(BEEarlyDepthTest, 0);
1155 
1156                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
1157                     uint32_t statCount = _mm_popcnt_u32(statMask);
1158                     UPDATE_STAT_BE(DepthPassCount, statCount);
1159                 }
1160 
1161 Endtile:
1162                 ATTR_UNUSED;
1163                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1164             }
1165 
1166             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1167             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1168 
1169             vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
1170         }
1171 
1172         vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
1173     }
1174 
1175     AR_END(BENullBackend, 0);
1176 }
1177 
InitClearTilesTable()1178 void InitClearTilesTable()
1179 {
1180     memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1181 
1182     sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1183     sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1184     sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1185     sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1186     sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1187 }
1188 
1189 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
1190 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
1191                                      [2] // centroid
1192                                      [2] // canEarlyZ
1193                                      = {};
1194 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1195                                        [SWR_MSAA_SAMPLE_PATTERN_COUNT]
1196                                        [SWR_INPUT_COVERAGE_COUNT]
1197                                        [2] // centroid
1198                                        [2] // forcedSampleCount
1199                                        [2] // canEarlyZ
1200                                        = {};
1201 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1202                                         [SWR_INPUT_COVERAGE_COUNT]
1203                                         [2] // centroid
1204                                         [2] // canEarlyZ
1205                                         = {};
1206 
1207 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
1208 // arguments to static template arguments.
1209 template <uint32_t... ArgsT>
1210 struct BEChooser
1211 {
1212     // Last Arg Terminator
GetFuncBEChooser1213     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1214     {
1215         switch(tArg)
1216         {
1217         case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1218         case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1219         case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1220         default:
1221             SWR_ASSERT(0 && "Invalid backend func\n");
1222             return nullptr;
1223             break;
1224         }
1225     }
1226 
1227     // Recursively parse args
1228     template <typename... TArgsT>
GetFuncBEChooser1229     static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1230     {
1231         switch(tArg)
1232         {
1233         case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1234         case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1235         default:
1236         SWR_ASSERT(0 && "Invalid sample pattern\n");
1237         return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1238         break;
1239         }
1240     }
1241 
1242     // Recursively parse args
1243     template <typename... TArgsT>
GetFuncBEChooser1244     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
1245     {
1246         switch(tArg)
1247         {
1248         case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
1249         case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
1250         case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
1251         default:
1252         SWR_ASSERT(0 && "Invalid sample pattern\n");
1253         return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
1254         break;
1255         }
1256     }
1257 
1258     // Recursively parse args
1259     template <typename... TArgsT>
GetFuncBEChooser1260     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1261     {
1262         switch(tArg)
1263         {
1264         case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1265         case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1266         case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1267         case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1268         case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1269         default:
1270         SWR_ASSERT(0 && "Invalid sample count\n");
1271         return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1272         break;
1273         }
1274     }
1275 
1276     // Recursively parse args
1277     template <typename... TArgsT>
GetFuncBEChooser1278     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1279     {
1280         if(tArg == true)
1281         {
1282             return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1283         }
1284 
1285         return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1286     }
1287 };
1288 
InitBackendSingleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_INPUT_COVERAGE_COUNT][2][2])1289 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
1290 {
1291     for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1292     {
1293         for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1294         {
1295             for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1296             {
1297                 table[inputCoverage][isCentroid][canEarlyZ] =
1298                     BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1299                                          (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1300             }
1301         }
1302     }
1303 }
1304 
InitBackendPixelFuncTable(PFN_BACKEND_FUNC (& table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])1305 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
1306 {
1307     for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1308     {
1309         for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
1310         {
1311             for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1312             {
1313                 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1314                 {
1315                     for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1316                     {
1317                         for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1318                         {
1319                             table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1320                                 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
1321                                                         (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1322                         }
1323                     }
1324                 }
1325             }
1326         }
1327     }
1328 }
1329 
InitBackendSampleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])1330 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
1331 {
1332     for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1333     {
1334         for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1335         {
1336             for(uint32_t centroid = 0; centroid < 2; centroid++)
1337             {
1338                 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1339                 {
1340                     table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1341                         BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1342                                              (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1343                 }
1344             }
1345         }
1346     }
1347 }
1348 
InitBackendFuncTables()1349 void InitBackendFuncTables()
1350 {
1351     InitBackendSingleFuncTable(gBackendSingleSample);
1352     InitBackendPixelFuncTable(gBackendPixelRateTable);
1353     InitBackendSampleFuncTable(gBackendSampleRateTable);
1354 
1355     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1356     gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1357     gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1358     gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1359     gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1360 }
1361