1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 template <typename T>
BackendSingleSample(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)41 void BackendSingleSample(DRAW_CONTEXT* pDC,
42 uint32_t workerId,
43 uint32_t x,
44 uint32_t y,
45 SWR_TRIANGLE_DESC& work,
46 RenderOutputBuffers& renderBuffers)
47 {
48 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESingleSampleBackend, pDC->drawId);
49 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
50
51 void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
52
53 const API_STATE& state = GetApiState(pDC);
54
55 BarycentricCoeffs coeffs;
56 SetupBarycentricCoeffs(&coeffs, work);
57
58 SWR_PS_CONTEXT psContext;
59 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
60 SetupPixelShaderContext<T>(&psContext, samplePos, work);
61
62 uint8_t *pDepthBuffer, *pStencilBuffer;
63 SetupRenderBuffers(psContext.pColorBuffer,
64 &pDepthBuffer,
65 &pStencilBuffer,
66 state.colorHottileEnable,
67 renderBuffers);
68
69 // Indicates backend rendered something to the color buffer
70 bool isTileDirty = false;
71
72 RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 1);
73
74 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
75 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
76
77 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
78
79 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
80 {
81 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
82 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
83
84 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
85
86 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
87 {
88 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
89
90
91 simdmask coverageMask = work.coverageMask[0] & MASK;
92
93 if (coverageMask)
94 {
95 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
96 {
97 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
98 "Unsupported depth hot tile format");
99
100 const simdscalar z =
101 _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));
102
103 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
104 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
105
106 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
107 }
108
109 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
110 {
111 const uint64_t* pCoverageMask =
112 (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
113 ? &work.innerCoverageMask
114 : &work.coverageMask[0];
115
116 generateInputCoverage<T, T::InputCoverage>(
117 pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
118 }
119
120 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
121
122 CalcPixelBarycentrics(coeffs, psContext);
123
124 CalcCentroid<T, true>(
125 &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
126
127 // interpolate and quantize z
128 psContext.vZ = vplaneps(
129 coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
130 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
131
132 RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 1);
133
134 // interpolate user clip distance if available
135 if (state.backendState.clipDistanceMask)
136 {
137 coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
138 work.pUserClipBuffer,
139 psContext.vI.center,
140 psContext.vJ.center);
141 }
142
143 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
144 simdscalar depthPassMask = vCoverageMask;
145 simdscalar stencilPassMask = vCoverageMask;
146
147 // Early-Z?
148 if (T::bCanEarlyZ)
149 {
150 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
151 depthPassMask = DepthStencilTest(&state,
152 work.triFlags.frontFacing,
153 work.triFlags.viewportIndex,
154 psContext.vZ,
155 pDepthBuffer,
156 vCoverageMask,
157 pStencilBuffer,
158 &stencilPassMask);
159 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
160 _simd_movemask_ps(stencilPassMask),
161 _simd_movemask_ps(vCoverageMask)));
162 RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
163
164 // early-exit if no pixels passed depth or earlyZ is forced on
165 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
166 {
167 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
168 &state.depthStencilState,
169 work.triFlags.frontFacing,
170 psContext.vZ,
171 pDepthBuffer,
172 depthPassMask,
173 vCoverageMask,
174 pStencilBuffer,
175 stencilPassMask);
176
177 if (!_simd_movemask_ps(depthPassMask))
178 {
179 goto Endtile;
180 }
181 }
182 }
183
184 psContext.sampleIndex = 0;
185 psContext.activeMask = _simd_castps_si(vCoverageMask);
186
187 // execute pixel shader
188 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
189 state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
190 RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
191
192 // update stats
193 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
194 AR_EVENT(PSStats((HANDLE)&psContext.stats));
195
196 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
197
198 if (_simd_movemask_ps(vCoverageMask))
199 {
200 isTileDirty = true;
201 }
202
203 // late-Z
204 if (!T::bCanEarlyZ)
205 {
206 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
207 depthPassMask = DepthStencilTest(&state,
208 work.triFlags.frontFacing,
209 work.triFlags.viewportIndex,
210 psContext.vZ,
211 pDepthBuffer,
212 vCoverageMask,
213 pStencilBuffer,
214 &stencilPassMask);
215 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
216 _simd_movemask_ps(stencilPassMask),
217 _simd_movemask_ps(vCoverageMask)));
218 RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
219
220 if (!_simd_movemask_ps(depthPassMask))
221 {
222 // need to call depth/stencil write for stencil write
223 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
224 &state.depthStencilState,
225 work.triFlags.frontFacing,
226 psContext.vZ,
227 pDepthBuffer,
228 depthPassMask,
229 vCoverageMask,
230 pStencilBuffer,
231 stencilPassMask);
232 goto Endtile;
233 }
234 }
235 else
236 {
237 // for early z, consolidate discards from shader
238 // into depthPassMask
239 depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
240 }
241
242 uint32_t statMask = _simd_movemask_ps(depthPassMask);
243 uint32_t statCount = _mm_popcnt_u32(statMask);
244 UPDATE_STAT_BE(DepthPassCount, statCount);
245
246 // output merger
247 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
248
249 OutputMerger8x2(pDC,
250 psContext,
251 psContext.pColorBuffer,
252 0,
253 &state.blendState,
254 state.pfnBlendFunc,
255 vCoverageMask,
256 depthPassMask,
257 state.psState.renderTargetMask,
258 useAlternateOffset,
259 workerId);
260
261 // do final depth write after all pixel kills
262 if (!state.psState.forceEarlyZ)
263 {
264 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
265 &state.depthStencilState,
266 work.triFlags.frontFacing,
267 psContext.vZ,
268 pDepthBuffer,
269 depthPassMask,
270 vCoverageMask,
271 pStencilBuffer,
272 stencilPassMask);
273 }
274 RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
275 }
276
277 Endtile:
278 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
279
280 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
281 if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
282 {
283 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
284 }
285
286 if (useAlternateOffset)
287 {
288 unsigned long rt;
289 uint32_t rtMask = state.colorHottileEnable;
290 while (_BitScanForward(&rt, rtMask))
291 {
292 rtMask &= ~(1 << rt);
293 psContext.pColorBuffer[rt] +=
294 (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
295 }
296 }
297
298 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
299 pStencilBuffer +=
300 (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
301
302 RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
303
304 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
305 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
306 }
307
308 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
309 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
310 }
311
312 if (isTileDirty)
313 {
314 SetRenderHotTilesDirty(pDC, renderBuffers);
315 }
316
317 RDTSC_END(pDC->pContext->pBucketMgr, BESingleSampleBackend, 0);
318 }
319
320 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
321 // arguments to static template arguments.
322 template <uint32_t... ArgsT>
323 struct BEChooserSingleSample
324 {
325 // Last Arg Terminator
GetFuncBEChooserSingleSample326 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
327 {
328 switch (tArg)
329 {
330 case SWR_BACKEND_SINGLE_SAMPLE:
331 return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
332 break;
333 case SWR_BACKEND_MSAA_PIXEL_RATE:
334 case SWR_BACKEND_MSAA_SAMPLE_RATE:
335 default:
336 SWR_ASSERT(0 && "Invalid backend func\n");
337 return nullptr;
338 break;
339 }
340 }
341
342 // Recursively parse args
343 template <typename... TArgsT>
GetFuncBEChooserSingleSample344 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
345 {
346 switch (tArg)
347 {
348 case SWR_INPUT_COVERAGE_NONE:
349 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
350 remainingArgs...);
351 break;
352 case SWR_INPUT_COVERAGE_NORMAL:
353 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
354 remainingArgs...);
355 break;
356 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
357 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
358 remainingArgs...);
359 break;
360 default:
361 SWR_ASSERT(0 && "Invalid sample pattern\n");
362 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
363 remainingArgs...);
364 break;
365 }
366 }
367
368 // Recursively parse args
369 template <typename... TArgsT>
GetFuncBEChooserSingleSample370 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
371 {
372 switch (tArg)
373 {
374 case SWR_MULTISAMPLE_1X:
375 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
376 break;
377 case SWR_MULTISAMPLE_2X:
378 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
379 break;
380 case SWR_MULTISAMPLE_4X:
381 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
382 break;
383 case SWR_MULTISAMPLE_8X:
384 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
385 break;
386 case SWR_MULTISAMPLE_16X:
387 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
388 break;
389 default:
390 SWR_ASSERT(0 && "Invalid sample count\n");
391 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
392 break;
393 }
394 }
395
396 // Recursively parse args
397 template <typename... TArgsT>
GetFuncBEChooserSingleSample398 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
399 {
400 if (tArg == true)
401 {
402 return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
403 }
404
405 return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
406 }
407 };
408
InitBackendSingleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_INPUT_COVERAGE_COUNT][2][2])409 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
410 {
411 for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
412 {
413 for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
414 {
415 for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
416 {
417 table[inputCoverage][isCentroid][canEarlyZ] =
418 BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X,
419 false,
420 (SWR_INPUT_COVERAGE)inputCoverage,
421 (isCentroid > 0),
422 false,
423 (canEarlyZ > 0),
424 SWR_BACKEND_SINGLE_SAMPLE);
425 }
426 }
427 }
428 }
429