1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessComputeBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t threadGroupId,void * & pSpillFillBuffer)48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 SWR_CONTEXT *pContext = pDC->pContext;
51
52 AR_BEGIN(BEDispatch, pDC->drawId);
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->ppScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
77
78 AR_END(BEDispatch, 1);
79 }
80
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessShutdownBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)86 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
87 {
88 // Dummy function
89 }
90
ProcessSyncBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)91 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
92 {
93 uint32_t x, y;
94 MacroTileMgr::getTileIndices(macroTile, x, y);
95 SWR_ASSERT(x == 0 && y == 0);
96 }
97
98 template<SWR_FORMAT format>
ClearRasterTile(uint8_t * pTileBuffer,simdvector & value)99 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
100 {
101 auto lambda = [&](int32_t comp)
102 {
103 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
104
105 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
106 };
107
108 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
109
110 for (uint32_t i = 0; i < numIter; ++i)
111 {
112 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
113 }
114 }
115
116 #if USE_8x2_TILE_BACKEND
117 template<SWR_FORMAT format>
ClearRasterTile(uint8_t * pTileBuffer,simd16vector & value)118 void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
119 {
120 auto lambda = [&](int32_t comp)
121 {
122 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
123
124 pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
125 };
126
127 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
128
129 for (uint32_t i = 0; i < numIter; ++i)
130 {
131 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
132 }
133 }
134
135 #endif
136 template<SWR_FORMAT format>
ClearMacroTile(DRAW_CONTEXT * pDC,SWR_RENDERTARGET_ATTACHMENT rt,uint32_t macroTile,uint32_t renderTargetArrayIndex,DWORD clear[4],const SWR_RECT & rect)137 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
138 {
139 // convert clear color to hottile format
140 // clear color is in RGBA float/uint32
141 #if USE_8x2_TILE_BACKEND
142 simd16vector vClear;
143 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
144 {
145 simd16scalar vComp;
146 vComp = _simd16_load1_ps((const float*)&clear[comp]);
147 if (FormatTraits<format>::isNormalized(comp))
148 {
149 vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
150 vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
151 }
152 vComp = FormatTraits<format>::pack(comp, vComp);
153 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
154 }
155
156 #else
157 simdvector vClear;
158 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
159 {
160 simdscalar vComp;
161 vComp = _simd_load1_ps((const float*)&clear[comp]);
162 if (FormatTraits<format>::isNormalized(comp))
163 {
164 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
165 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
166 }
167 vComp = FormatTraits<format>::pack(comp, vComp);
168 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
169 }
170
171 #endif
172 uint32_t tileX, tileY;
173 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
174
175 // Init to full macrotile
176 SWR_RECT clearTile =
177 {
178 KNOB_MACROTILE_X_DIM * int32_t(tileX),
179 KNOB_MACROTILE_Y_DIM * int32_t(tileY),
180 KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
181 KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
182 };
183
184 // intersect with clear rect
185 clearTile &= rect;
186
187 // translate to local hottile origin
188 clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
189
190 // Make maximums inclusive (needed for convert to raster tiles)
191 clearTile.xmax -= 1;
192 clearTile.ymax -= 1;
193
194 // convert to raster tiles
195 clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
196 clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
197 clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
198 clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
199
200 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
201 // compute steps between raster tile samples / raster tiles / macro tile rows
202 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
203 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
204 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
205 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
206
207 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
208 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
209 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
210
211 // loop over all raster tiles in the current hot tile
212 for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
213 {
214 uint8_t* pRasterTile = pRasterTileRow;
215 for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
216 {
217 for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
218 {
219 ClearRasterTile<format>(pRasterTile, vClear);
220 pRasterTile += rasterTileSampleStep;
221 }
222 }
223 pRasterTileRow += macroTileRowStep;
224 }
225
226 pHotTile->state = HOTTILE_DIRTY;
227 }
228
229
ProcessClearBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)230 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
231 {
232 SWR_CONTEXT *pContext = pDC->pContext;
233
234 if (KNOB_FAST_CLEAR)
235 {
236 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
237 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
238 uint32_t numSamples = GetNumSamples(sampleCount);
239
240 SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
241
242 AR_BEGIN(BEClear, pDC->drawId);
243
244 if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
245 {
246 unsigned long rt = 0;
247 uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
248 while (_BitScanForward(&rt, mask))
249 {
250 mask &= ~(1 << rt);
251
252 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
253
254 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
255 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
256 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
257 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
258 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
259 pHotTile->state = HOTTILE_CLEAR;
260 }
261 }
262
263 if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
264 {
265 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
266 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
267 pHotTile->state = HOTTILE_CLEAR;
268 }
269
270 if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
271 {
272 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
273
274 pHotTile->clearData[0] = pClear->clearStencil;
275 pHotTile->state = HOTTILE_CLEAR;
276 }
277
278 AR_END(BEClear, 1);
279 }
280 else
281 {
282 // Legacy clear
283 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
284 AR_BEGIN(BEClear, pDC->drawId);
285
286 if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
287 {
288 DWORD clearData[4];
289 clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
290 clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
291 clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
292 clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
293
294 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
295 SWR_ASSERT(pfnClearTiles != nullptr);
296
297 unsigned long rt = 0;
298 uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
299 while (_BitScanForward(&rt, mask))
300 {
301 mask &= ~(1 << rt);
302
303 pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
304 }
305 }
306
307 if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
308 {
309 DWORD clearData[4];
310 clearData[0] = *(DWORD*)&pClear->clearDepth;
311 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
312 SWR_ASSERT(pfnClearTiles != nullptr);
313
314 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
315 }
316
317 if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
318 {
319 DWORD clearData[4];
320 clearData[0] = pClear->clearStencil;
321 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
322
323 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
324 }
325
326 AR_END(BEClear, 1);
327 }
328 }
329
ProcessStoreTileBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,STORE_TILES_DESC * pDesc,SWR_RENDERTARGET_ATTACHMENT attachment)330 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
331 SWR_RENDERTARGET_ATTACHMENT attachment)
332 {
333 SWR_CONTEXT *pContext = pDC->pContext;
334
335 AR_BEGIN(BEStoreTiles, pDC->drawId);
336
337 SWR_FORMAT srcFormat;
338 switch (attachment)
339 {
340 case SWR_ATTACHMENT_COLOR0:
341 case SWR_ATTACHMENT_COLOR1:
342 case SWR_ATTACHMENT_COLOR2:
343 case SWR_ATTACHMENT_COLOR3:
344 case SWR_ATTACHMENT_COLOR4:
345 case SWR_ATTACHMENT_COLOR5:
346 case SWR_ATTACHMENT_COLOR6:
347 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
348 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
349 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
350 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
351 }
352
353 uint32_t x, y;
354 MacroTileMgr::getTileIndices(macroTile, x, y);
355
356 // Only need to store the hottile if it's been rendered to...
357 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
358 if (pHotTile)
359 {
360 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
361 if (pHotTile->state == HOTTILE_CLEAR)
362 {
363 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
364 SWR_ASSERT(pfnClearTiles != nullptr);
365
366 pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
367 }
368
369 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
370 {
371 int32_t destX = KNOB_MACROTILE_X_DIM * x;
372 int32_t destY = KNOB_MACROTILE_Y_DIM * y;
373
374 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
375 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
376 }
377
378
379 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
380 {
381 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
382 }
383 }
384 AR_END(BEStoreTiles, 1);
385 }
386
ProcessStoreTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)387 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
388 {
389 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
390
391 unsigned long rt = 0;
392 uint32_t mask = pDesc->attachmentMask;
393 while (_BitScanForward(&rt, mask))
394 {
395 mask &= ~(1 << rt);
396 ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
397 }
398 }
399
ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)400 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
401 {
402 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
403 SWR_CONTEXT *pContext = pDC->pContext;
404
405 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
406
407 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
408 {
409 if (pDesc->attachmentMask & (1 << i))
410 {
411 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
412 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
413 if (pHotTile)
414 {
415 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
416 }
417 }
418 }
419 }
420
421 #if KNOB_SIMD_WIDTH == 8
422 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
423 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
424 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
425 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
426 #else
427 #error Unsupported vector width
428 #endif
429
ComputeUserClipMask(uint8_t clipMask,float * pUserClipBuffer,simdscalar vI,simdscalar vJ)430 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
431 {
432 simdscalar vClipMask = _simd_setzero_ps();
433 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
434
435 for (uint32_t i = 0; i < numClipDistance; ++i)
436 {
437 // pull triangle clip distance values from clip buffer
438 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
439 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
440 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
441
442 // interpolate
443 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
444
445 // clip if interpolated clip distance is < 0 || NAN
446 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
447
448 vClipMask = _simd_or_ps(vClipMask, vCull);
449 }
450
451 return _simd_movemask_ps(vClipMask);
452 }
453
454 template<typename T>
BackendSingleSample(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)455 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
456 {
457 SWR_CONTEXT *pContext = pDC->pContext;
458
459 AR_BEGIN(BESingleSampleBackend, pDC->drawId);
460 AR_BEGIN(BESetup, pDC->drawId);
461
462 const API_STATE &state = GetApiState(pDC);
463
464 BarycentricCoeffs coeffs;
465 SetupBarycentricCoeffs(&coeffs, work);
466
467 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
468 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
469
470 SWR_PS_CONTEXT psContext;
471 SetupPixelShaderContext<T>(&psContext, work);
472
473 AR_END(BESetup, 1);
474
475 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
476 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
477
478 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
479
480 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
481 {
482 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
483 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
484
485 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
486
487 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
488 {
489 #if USE_8x2_TILE_BACKEND
490 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
491
492 #endif
493 simdmask coverageMask = work.coverageMask[0] & MASK;
494
495 if (coverageMask)
496 {
497 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
498 {
499 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
500
501 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
502
503 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
504 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
505
506 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
507 }
508
509 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
510 {
511 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
512
513 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
514 }
515
516 AR_BEGIN(BEBarycentric, pDC->drawId);
517
518 CalcPixelBarycentrics(coeffs, psContext);
519
520 CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
521
522 // interpolate and quantize z
523 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
524 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
525
526 AR_END(BEBarycentric, 1);
527
528 // interpolate user clip distance if available
529 if (state.rastState.clipDistanceMask)
530 {
531 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
532 }
533
534 simdscalar vCoverageMask = vMask(coverageMask);
535 simdscalar depthPassMask = vCoverageMask;
536 simdscalar stencilPassMask = vCoverageMask;
537
538 // Early-Z?
539 if (T::bCanEarlyZ)
540 {
541 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
542 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
543 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
544 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
545 AR_END(BEEarlyDepthTest, 0);
546
547 // early-exit if no pixels passed depth or earlyZ is forced on
548 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
549 {
550 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
551 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
552
553 if (!_simd_movemask_ps(depthPassMask))
554 {
555 goto Endtile;
556 }
557 }
558 }
559
560 psContext.sampleIndex = 0;
561 psContext.activeMask = _simd_castps_si(vCoverageMask);
562
563 // execute pixel shader
564 AR_BEGIN(BEPixelShader, pDC->drawId);
565 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
566 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
567 AR_END(BEPixelShader, 0);
568
569 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
570
571 // late-Z
572 if (!T::bCanEarlyZ)
573 {
574 AR_BEGIN(BELateDepthTest, pDC->drawId);
575 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
576 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
577 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
578 AR_END(BELateDepthTest, 0);
579
580 if (!_simd_movemask_ps(depthPassMask))
581 {
582 // need to call depth/stencil write for stencil write
583 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
584 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
585 goto Endtile;
586 }
587 }
588
589 uint32_t statMask = _simd_movemask_ps(depthPassMask);
590 uint32_t statCount = _mm_popcnt_u32(statMask);
591 UPDATE_STAT_BE(DepthPassCount, statCount);
592
593 // output merger
594 AR_BEGIN(BEOutputMerger, pDC->drawId);
595 #if USE_8x2_TILE_BACKEND
596 OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
597 #else
598 OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
599 #endif
600
601 // do final depth write after all pixel kills
602 if (!state.psState.forceEarlyZ)
603 {
604 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
605 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
606 }
607 AR_END(BEOutputMerger, 0);
608 }
609
610 Endtile:
611 AR_BEGIN(BEEndTile, pDC->drawId);
612
613 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
614 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
615 {
616 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
617 }
618
619 #if USE_8x2_TILE_BACKEND
620 if (useAlternateOffset)
621 {
622 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
623 {
624 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
625 }
626 }
627 #else
628 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
629 {
630 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
631 }
632 #endif
633 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
634 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
635
636 AR_END(BEEndTile, 0);
637
638 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
639 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
640 }
641
642 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
643 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
644 }
645
646 AR_END(BESingleSampleBackend, 0);
647 }
648
649 template<typename T>
BackendSampleRate(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)650 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
651 {
652 SWR_CONTEXT *pContext = pDC->pContext;
653
654 AR_BEGIN(BESampleRateBackend, pDC->drawId);
655 AR_BEGIN(BESetup, pDC->drawId);
656
657 const API_STATE &state = GetApiState(pDC);
658
659 BarycentricCoeffs coeffs;
660 SetupBarycentricCoeffs(&coeffs, work);
661
662 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
663 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
664
665 SWR_PS_CONTEXT psContext;
666 SetupPixelShaderContext<T>(&psContext, work);
667
668 AR_END(BESetup, 0);
669
670 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
671 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
672
673 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
674
675 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
676 {
677 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
678 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
679
680 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
681
682 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
683 {
684 #if USE_8x2_TILE_BACKEND
685 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
686
687 #endif
688 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
689 {
690 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
691
692 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
693 }
694
695 AR_BEGIN(BEBarycentric, pDC->drawId);
696
697 CalcPixelBarycentrics(coeffs, psContext);
698
699 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
700
701 AR_END(BEBarycentric, 0);
702
703 for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
704 {
705 simdmask coverageMask = work.coverageMask[sample] & MASK;
706
707 if (coverageMask)
708 {
709 // offset depth/stencil buffers current sample
710 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
711 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
712
713 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
714 {
715 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
716
717 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
718
719 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
720 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
721
722 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
723 }
724
725 AR_BEGIN(BEBarycentric, pDC->drawId);
726
727 // calculate per sample positions
728 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
729 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
730
731 CalcSampleBarycentrics(coeffs, psContext);
732
733 // interpolate and quantize z
734 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
735 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
736
737 AR_END(BEBarycentric, 0);
738
739 // interpolate user clip distance if available
740 if (state.rastState.clipDistanceMask)
741 {
742 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
743 }
744
745 simdscalar vCoverageMask = vMask(coverageMask);
746 simdscalar depthPassMask = vCoverageMask;
747 simdscalar stencilPassMask = vCoverageMask;
748
749 // Early-Z?
750 if (T::bCanEarlyZ)
751 {
752 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
753 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
754 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
755 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
756 AR_END(BEEarlyDepthTest, 0);
757
758 // early-exit if no samples passed depth or earlyZ is forced on.
759 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
760 {
761 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
762 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
763
764 if (!_simd_movemask_ps(depthPassMask))
765 {
766 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
767 continue;
768 }
769 }
770 }
771
772 psContext.sampleIndex = sample;
773 psContext.activeMask = _simd_castps_si(vCoverageMask);
774
775 // execute pixel shader
776 AR_BEGIN(BEPixelShader, pDC->drawId);
777 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
778 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
779 AR_END(BEPixelShader, 0);
780
781 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
782
783 // late-Z
784 if (!T::bCanEarlyZ)
785 {
786 AR_BEGIN(BELateDepthTest, pDC->drawId);
787 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
788 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
789 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
790 AR_END(BELateDepthTest, 0);
791
792 if (!_simd_movemask_ps(depthPassMask))
793 {
794 // need to call depth/stencil write for stencil write
795 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
796 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
797
798 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
799 continue;
800 }
801 }
802
803 uint32_t statMask = _simd_movemask_ps(depthPassMask);
804 uint32_t statCount = _mm_popcnt_u32(statMask);
805 UPDATE_STAT_BE(DepthPassCount, statCount);
806
807 // output merger
808 AR_BEGIN(BEOutputMerger, pDC->drawId);
809 #if USE_8x2_TILE_BACKEND
810 OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
811 #else
812 OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
813 #endif
814
815 // do final depth write after all pixel kills
816 if (!state.psState.forceEarlyZ)
817 {
818 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
819 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
820 }
821 AR_END(BEOutputMerger, 0);
822 }
823 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
824 }
825
826 Endtile:
827 ATTR_UNUSED;
828
829 AR_BEGIN(BEEndTile, pDC->drawId);
830
831 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
832 {
833 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
834 }
835
836 #if USE_8x2_TILE_BACKEND
837 if (useAlternateOffset)
838 {
839 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
840 {
841 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
842 }
843 }
844 #else
845 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
846 {
847 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
848 }
849 #endif
850 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
851 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
852
853 AR_END(BEEndTile, 0);
854
855 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
856 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
857 }
858
859 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
860 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
861 }
862
863 AR_END(BESampleRateBackend, 0);
864 }
865
866 template<typename T>
BackendPixelRate(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)867 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
868 {
869 SWR_CONTEXT *pContext = pDC->pContext;
870
871 AR_BEGIN(BEPixelRateBackend, pDC->drawId);
872 AR_BEGIN(BESetup, pDC->drawId);
873
874 const API_STATE &state = GetApiState(pDC);
875
876 BarycentricCoeffs coeffs;
877 SetupBarycentricCoeffs(&coeffs, work);
878
879 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
880 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
881
882 SWR_PS_CONTEXT psContext;
883 SetupPixelShaderContext<T>(&psContext, work);
884
885 AR_END(BESetup, 0);
886
887 PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
888
889 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
890 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
891
892 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
893
894 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
895 {
896 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
897 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
898
899 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
900
901 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
902 {
903 #if USE_8x2_TILE_BACKEND
904 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
905
906 #endif
907 simdscalar activeLanes;
908 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
909 activeLanes = vMask(work.anyCoveredSamples & MASK);
910
911 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
912 {
913 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
914
915 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
916 }
917
918 AR_BEGIN(BEBarycentric, pDC->drawId);
919
920 CalcPixelBarycentrics(coeffs, psContext);
921
922 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
923
924 AR_END(BEBarycentric, 0);
925
926 if(T::bForcedSampleCount)
927 {
928 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
929 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
930 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
931 }
932
933 // Early-Z?
934 if(T::bCanEarlyZ && !T::bForcedSampleCount)
935 {
936 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
937 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
938 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
939 }
940
941 // if we have no covered samples that passed depth at this point, go to next tile
942 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
943
944 if(state.psState.usesSourceDepth)
945 {
946 AR_BEGIN(BEBarycentric, pDC->drawId);
947 // interpolate and quantize z
948 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
949 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
950 AR_END(BEBarycentric, 0);
951 }
952
953 // pixels that are currently active
954 psContext.activeMask = _simd_castps_si(activeLanes);
955 psContext.oMask = T::MultisampleT::FullSampleMask();
956
957 // execute pixel shader
958 AR_BEGIN(BEPixelShader, pDC->drawId);
959 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
960 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
961 AR_END(BEPixelShader, 0);
962
963 // update active lanes to remove any discarded or oMask'd pixels
964 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
965 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
966
967 // late-Z
968 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
969 {
970 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
971 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
972 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
973 }
974
975 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
976 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
977
978 // output merger
979 // loop over all samples, broadcasting the results of the PS to all passing pixels
980 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
981 {
982 AR_BEGIN(BEOutputMerger, pDC->drawId);
983 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
984 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
985 simdscalar coverageMask, depthMask;
986 if(T::bForcedSampleCount)
987 {
988 coverageMask = depthMask = activeLanes;
989 }
990 else
991 {
992 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
993 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
994 if(!_simd_movemask_ps(depthMask))
995 {
996 // stencil should already have been written in early/lateZ tests
997 AR_END(BEOutputMerger, 0);
998 continue;
999 }
1000 }
1001
1002 // broadcast the results of the PS to all passing pixels
1003 #if USE_8x2_TILE_BACKEND
1004 OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
1005 #else
1006 OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
1007 #endif
1008
1009 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
1010 {
1011 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1012 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1013
1014 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1015 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1016 }
1017 AR_END(BEOutputMerger, 0);
1018 }
1019 Endtile:
1020 AR_BEGIN(BEEndTile, pDC->drawId);
1021
1022 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1023 {
1024 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1025 }
1026
1027 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1028 {
1029 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1030 }
1031 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1032
1033 #if USE_8x2_TILE_BACKEND
1034 if (useAlternateOffset)
1035 {
1036 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1037 {
1038 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1039 }
1040 }
1041 #else
1042 for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1043 {
1044 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1045 }
1046 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1047 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1048 #endif
1049
1050 AR_END(BEEndTile, 0);
1051
1052 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
1053 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
1054 }
1055
1056 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
1057 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
1058 }
1059
1060 AR_END(BEPixelRateBackend, 0);
1061 }
1062 // optimized backend flow with NULL PS
1063 template<uint32_t sampleCountT>
BackendNullPS(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)1064 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1065 {
1066 SWR_CONTEXT *pContext = pDC->pContext;
1067
1068 AR_BEGIN(BENullBackend, pDC->drawId);
1069 ///@todo: handle center multisample pattern
1070 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1071 AR_BEGIN(BESetup, pDC->drawId);
1072
1073 const API_STATE &state = GetApiState(pDC);
1074
1075 BarycentricCoeffs coeffs;
1076 SetupBarycentricCoeffs(&coeffs, work);
1077
1078 uint8_t *pDepthBuffer, *pStencilBuffer;
1079 SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
1080
1081 SWR_PS_CONTEXT psContext;
1082 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
1083
1084 AR_END(BESetup, 0);
1085
1086 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
1087
1088 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
1089
1090 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1091 {
1092 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
1093
1094 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
1095
1096 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1097 {
1098 // iterate over active samples
1099 unsigned long sample = 0;
1100 uint32_t sampleMask = state.blendState.sampleMask;
1101 while (_BitScanForward(&sample, sampleMask))
1102 {
1103 sampleMask &= ~(1 << sample);
1104
1105 simdmask coverageMask = work.coverageMask[sample] & MASK;
1106
1107 if (coverageMask)
1108 {
1109 // offset depth/stencil buffers current sample
1110 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1111 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1112
1113 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
1114 {
1115 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
1116
1117 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
1118
1119 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
1120 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
1121
1122 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
1123 }
1124
1125 AR_BEGIN(BEBarycentric, pDC->drawId);
1126
1127 // calculate per sample positions
1128 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1129 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1130
1131 CalcSampleBarycentrics(coeffs, psContext);
1132
1133 // interpolate and quantize z
1134 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1135 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1136
1137 AR_END(BEBarycentric, 0);
1138
1139 // interpolate user clip distance if available
1140 if (state.rastState.clipDistanceMask)
1141 {
1142 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
1143 }
1144
1145 simdscalar vCoverageMask = vMask(coverageMask);
1146 simdscalar stencilPassMask = vCoverageMask;
1147
1148 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
1149 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
1150 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1151 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
1152 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1153 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1154 AR_END(BEEarlyDepthTest, 0);
1155
1156 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1157 uint32_t statCount = _mm_popcnt_u32(statMask);
1158 UPDATE_STAT_BE(DepthPassCount, statCount);
1159 }
1160
1161 Endtile:
1162 ATTR_UNUSED;
1163 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1164 }
1165
1166 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1167 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1168
1169 vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
1170 }
1171
1172 vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
1173 }
1174
1175 AR_END(BENullBackend, 0);
1176 }
1177
InitClearTilesTable()1178 void InitClearTilesTable()
1179 {
1180 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1181
1182 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1183 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1184 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1185 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1186 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1187 }
1188
1189 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
1190 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
1191 [2] // centroid
1192 [2] // canEarlyZ
1193 = {};
1194 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1195 [SWR_MSAA_SAMPLE_PATTERN_COUNT]
1196 [SWR_INPUT_COVERAGE_COUNT]
1197 [2] // centroid
1198 [2] // forcedSampleCount
1199 [2] // canEarlyZ
1200 = {};
1201 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1202 [SWR_INPUT_COVERAGE_COUNT]
1203 [2] // centroid
1204 [2] // canEarlyZ
1205 = {};
1206
1207 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1208 // arguments to static template arguments.
1209 template <uint32_t... ArgsT>
1210 struct BEChooser
1211 {
1212 // Last Arg Terminator
GetFuncBEChooser1213 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1214 {
1215 switch(tArg)
1216 {
1217 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1218 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1219 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1220 default:
1221 SWR_ASSERT(0 && "Invalid backend func\n");
1222 return nullptr;
1223 break;
1224 }
1225 }
1226
1227 // Recursively parse args
1228 template <typename... TArgsT>
GetFuncBEChooser1229 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1230 {
1231 switch(tArg)
1232 {
1233 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1234 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1235 default:
1236 SWR_ASSERT(0 && "Invalid sample pattern\n");
1237 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1238 break;
1239 }
1240 }
1241
1242 // Recursively parse args
1243 template <typename... TArgsT>
GetFuncBEChooser1244 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
1245 {
1246 switch(tArg)
1247 {
1248 case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
1249 case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
1250 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
1251 default:
1252 SWR_ASSERT(0 && "Invalid sample pattern\n");
1253 return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
1254 break;
1255 }
1256 }
1257
1258 // Recursively parse args
1259 template <typename... TArgsT>
GetFuncBEChooser1260 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1261 {
1262 switch(tArg)
1263 {
1264 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1265 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1266 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1267 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1268 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1269 default:
1270 SWR_ASSERT(0 && "Invalid sample count\n");
1271 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1272 break;
1273 }
1274 }
1275
1276 // Recursively parse args
1277 template <typename... TArgsT>
GetFuncBEChooser1278 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1279 {
1280 if(tArg == true)
1281 {
1282 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1283 }
1284
1285 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1286 }
1287 };
1288
InitBackendSingleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_INPUT_COVERAGE_COUNT][2][2])1289 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
1290 {
1291 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1292 {
1293 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1294 {
1295 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1296 {
1297 table[inputCoverage][isCentroid][canEarlyZ] =
1298 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1299 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1300 }
1301 }
1302 }
1303 }
1304
InitBackendPixelFuncTable(PFN_BACKEND_FUNC (& table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])1305 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
1306 {
1307 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1308 {
1309 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
1310 {
1311 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1312 {
1313 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1314 {
1315 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1316 {
1317 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1318 {
1319 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1320 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
1321 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1322 }
1323 }
1324 }
1325 }
1326 }
1327 }
1328 }
1329
InitBackendSampleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])1330 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
1331 {
1332 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1333 {
1334 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1335 {
1336 for(uint32_t centroid = 0; centroid < 2; centroid++)
1337 {
1338 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1339 {
1340 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1341 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1342 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1343 }
1344 }
1345 }
1346 }
1347 }
1348
InitBackendFuncTables()1349 void InitBackendFuncTables()
1350 {
1351 InitBackendSingleFuncTable(gBackendSingleSample);
1352 InitBackendPixelFuncTable(gBackendPixelRateTable);
1353 InitBackendSampleFuncTable(gBackendSampleRateTable);
1354
1355 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1356 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1357 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1358 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1359 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1360 }
1361