1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.h
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29 #pragma once
30
31 #include "tilemgr.h"
32 #include "state.h"
33 #include "context.h"
34
35
36 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
37 void InitBackendSampleFuncTable(
38 PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
39
40 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
41 SWR_PS_CONTEXT& psContext);
42
43
// Identifiers for the backend function families.
// SWR_BACKEND_FUNCS_MAX is the number of families, not a family itself.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE    = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
    SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
    SWR_BACKEND_FUNCS_MAX        = 3,
};
51
#if KNOB_SIMD_WIDTH == 8
// Keeps the low 8 bits of a coverage word: one bit per SIMD lane.
#define MASK 0xff

// Per-lane pixel offsets inside one SIMD tile. Lanes 0-3 form a 2x2 quad at x = 0..1 and
// lanes 4-7 form the neighbouring 2x2 quad at x = 2..3.
// "Center" offsets address pixel centers, "UL" offsets address upper-left pixel corners.
static const __m256 vCenterOffsetsX = __m256{0.5f, 1.5f, 0.5f, 1.5f, 2.5f, 3.5f, 2.5f, 3.5f};
static const __m256 vCenterOffsetsY = __m256{0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f};
static const __m256 vULOffsetsX     = __m256{0.0f, 1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 2.0f, 3.0f};
static const __m256 vULOffsetsY     = __m256{0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f};
#endif
59
ComputeUserClipMask(uint8_t clipMask,float * pUserClipBuffer,simdscalar const & vI,simdscalar const & vJ)60 static INLINE simdmask ComputeUserClipMask(uint8_t clipMask,
61 float* pUserClipBuffer,
62 simdscalar const& vI,
63 simdscalar const& vJ)
64 {
65 simdscalar vClipMask = _simd_setzero_ps();
66 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
67
68 for (uint32_t i = 0; i < numClipDistance; ++i)
69 {
70 // pull triangle clip distance values from clip buffer
71 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
72 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
73 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
74
75 // interpolate
76 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
77
78 // clip if interpolated clip distance is < 0 || NAN
79 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
80
81 vClipMask = _simd_or_ps(vClipMask, vCull);
82 }
83
84 return _simd_movemask_ps(vClipMask);
85 }
86
RasterTileColorOffset(uint32_t sampleNum)87 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
88 {
89 static const uint32_t RasterTileColorOffsets[16]{
90 0,
91 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
92 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
93 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
94 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
95 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
96 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
97 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
98 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
99 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
100 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
101 10,
102 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
103 11,
104 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
105 12,
106 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
107 13,
108 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
109 14,
110 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
111 15,
112 };
113 assert(sampleNum < 16);
114 return RasterTileColorOffsets[sampleNum];
115 }
116
RasterTileDepthOffset(uint32_t sampleNum)117 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
118 {
119 static const uint32_t RasterTileDepthOffsets[16]{
120 0,
121 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
122 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
123 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
124 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
125 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
126 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
127 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
128 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
129 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
130 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
131 10,
132 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
133 11,
134 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
135 12,
136 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
137 13,
138 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
139 14,
140 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
141 15,
142 };
143 assert(sampleNum < 16);
144 return RasterTileDepthOffsets[sampleNum];
145 }
146
RasterTileStencilOffset(uint32_t sampleNum)147 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
148 {
149 static const uint32_t RasterTileStencilOffsets[16]{
150 0,
151 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
152 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
153 2,
154 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
155 3,
156 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
157 4,
158 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
159 5,
160 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
161 6,
162 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
163 7,
164 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
165 8,
166 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
167 9,
168 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
169 10,
170 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
171 11,
172 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
173 12,
174 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
175 13,
176 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
177 14,
178 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
179 15,
180 };
181 assert(sampleNum < 16);
182 return RasterTileStencilOffsets[sampleNum];
183 }
184
185 template <typename T, uint32_t InputCoverage>
186 struct generateInputCoverage
187 {
generateInputCoveragegenerateInputCoverage188 INLINE generateInputCoverage(const uint64_t* const coverageMask,
189 uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
190 const uint32_t sampleMask)
191 {
192 // will need to update for avx512
193 assert(KNOB_SIMD_WIDTH == 8);
194
195 simdscalari mask[2];
196 simdscalari sampleCoverage[2];
197
198 if (T::bIsCenterPattern)
199 {
200 // center coverage is the same for all samples; just broadcast to the sample slots
201 uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
202 if (T::MultisampleT::numSamples == 1)
203 {
204 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
205 }
206 else if (T::MultisampleT::numSamples == 2)
207 {
208 sampleCoverage[0] =
209 _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
210 }
211 else if (T::MultisampleT::numSamples == 4)
212 {
213 sampleCoverage[0] = _simd_set_epi32(
214 0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
215 }
216 else if (T::MultisampleT::numSamples == 8)
217 {
218 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
219 }
220 else if (T::MultisampleT::numSamples == 16)
221 {
222 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
223 sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
224 }
225 }
226 else
227 {
228 simdscalari src = _simd_set1_epi32(0);
229 simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
230
231 if (T::MultisampleT::numSamples == 1)
232 {
233 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
234 }
235 else if (T::MultisampleT::numSamples == 2)
236 {
237 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
238 }
239 else if (T::MultisampleT::numSamples == 4)
240 {
241 mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
242 }
243 else if (T::MultisampleT::numSamples == 8)
244 {
245 mask[0] = _simd_set1_epi32(-1);
246 }
247 else if (T::MultisampleT::numSamples == 16)
248 {
249 mask[0] = _simd_set1_epi32(-1);
250 mask[1] = _simd_set1_epi32(-1);
251 index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
252 }
253
254 // gather coverage for samples 0-7
255 sampleCoverage[0] =
256 _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
257 (const float*)coverageMask,
258 index0,
259 _mm256_castsi256_ps(mask[0]),
260 8));
261 if (T::MultisampleT::numSamples > 8)
262 {
263 // gather coverage for samples 8-15
264 sampleCoverage[1] =
265 _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
266 (const float*)coverageMask,
267 index1,
268 _mm256_castsi256_ps(mask[1]),
269 8));
270 }
271 }
272
273 mask[0] = _mm256_set_epi8(-1,
274 -1,
275 -1,
276 -1,
277 -1,
278 -1,
279 -1,
280 -1,
281 -1,
282 -1,
283 -1,
284 -1,
285 0xC,
286 0x8,
287 0x4,
288 0x0,
289 -1,
290 -1,
291 -1,
292 -1,
293 -1,
294 -1,
295 -1,
296 -1,
297 -1,
298 -1,
299 -1,
300 -1,
301 0xC,
302 0x8,
303 0x4,
304 0x0);
305 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
306 simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
307
308 simdscalari packedCoverage1;
309 if (T::MultisampleT::numSamples > 8)
310 {
311 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit
312 // lane
313 packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
314 }
315
316 #if (KNOB_ARCH == KNOB_ARCH_AVX)
317 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
318 simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
319 simdscalar shufRes = _mm256_shuffle_ps(
320 _mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
321 packedCoverage0 = _mm256_castps_si256(
322 _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
323
324 simdscalari packedSampleCoverage;
325 if (T::MultisampleT::numSamples > 8)
326 {
327 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
328 hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
329 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow),
330 _mm256_castsi256_ps(hiToLow),
331 _MM_SHUFFLE(1, 1, 0, 1));
332 shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
333 packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(
334 _mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
335 packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(
336 _mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
337 }
338 else
339 {
340 packedSampleCoverage = packedCoverage0;
341 }
342 #else
343 simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
344 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
345 packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
346
347 simdscalari packedSampleCoverage;
348 if (T::MultisampleT::numSamples > 8)
349 {
350 permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
351 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
352 packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
353
354 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
355 packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
356 }
357 else
358 {
359 packedSampleCoverage = packedCoverage0;
360 }
361 #endif
362
363 for (int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
364 {
365 // convert packed sample coverage masks into single coverage masks for all samples for
366 // each pixel in the 4x2
367 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
368
369 if (!T::bForcedSampleCount)
370 {
371 // input coverage has to be anded with sample mask if MSAA isn't forced on
372 inputMask[i] &= sampleMask;
373 }
374
375 // shift to the next pixel in the 4x2
376 packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
377 }
378 }
379
generateInputCoveragegenerateInputCoverage380 INLINE generateInputCoverage(const uint64_t* const coverageMask,
381 simdscalar& inputCoverage,
382 const uint32_t sampleMask)
383 {
384 uint32_t inputMask[KNOB_SIMD_WIDTH];
385 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
386 inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7],
387 inputMask[6],
388 inputMask[5],
389 inputMask[4],
390 inputMask[3],
391 inputMask[2],
392 inputMask[1],
393 inputMask[0]));
394 }
395 };
396
397 template <typename T>
398 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
399 {
400 INLINE generateInputCoverage(const uint64_t* const coverageMask,
401 simdscalar& inputCoverage,
402 const uint32_t sampleMask)
403 {
404 // will need to update for avx512
405 assert(KNOB_SIMD_WIDTH == 8);
406 simdscalari vec = _simd_set1_epi32(coverageMask[0]);
407 const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
408 vec = _simd_and_si(vec, bit);
409 vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
410 vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
411 inputCoverage = _simd_castsi_ps(vec);
412 }
413
414 INLINE generateInputCoverage(const uint64_t* const coverageMask,
415 uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
416 const uint32_t sampleMask)
417 {
418 uint32_t simdCoverage = (coverageMask[0] & MASK);
419 static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
420 for (int i = 0; i < KNOB_SIMD_WIDTH; i++)
421 {
422 // set all samples to covered if conservative coverage mask is set for that pixel
423 inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
424 }
425 }
426 };
427
428 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
429 // Centroid behaves exactly as follows :
430 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center
431 // (even if the sample pattern does not happen to
432 // have a sample location there).
433 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample
434 // index, where sample coverage is after ANDing the
435 // coverage with the SampleMask Rasterizer State.
436 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to
437 // fill out 2x2 pixel stamps, the attribute is
438 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the
439 // pixel, then the first sample covered by the SampleMask Rasterizer State is the evaluation
440 // point.Otherwise (full SampleMask), the pixel center is the evaluation point.
441 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
442 template <typename T>
443 INLINE void CalcCentroidPos(SWR_PS_CONTEXT& psContext,
444 const SWR_MULTISAMPLE_POS& samplePos,
445 const uint64_t* const coverageMask,
446 const uint32_t sampleMask,
447 simdscalar const& vXSamplePosUL,
448 simdscalar const& vYSamplePosUL)
449 {
450 uint32_t inputMask[KNOB_SIMD_WIDTH];
451 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
452
453 // Case (2) - partially covered pixel
454
455 // scan for first covered sample per pixel in the 4x2 span
456 unsigned long sampleNum[KNOB_SIMD_WIDTH];
457 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
458 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
459 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
460 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
461 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
462 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
463 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
464 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
465
466 // look up and set the sample offsets from UL pixel corner for first covered sample
467 simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
468 samplePos.X(sampleNum[6]),
469 samplePos.X(sampleNum[5]),
470 samplePos.X(sampleNum[4]),
471 samplePos.X(sampleNum[3]),
472 samplePos.X(sampleNum[2]),
473 samplePos.X(sampleNum[1]),
474 samplePos.X(sampleNum[0]));
475
476 simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
477 samplePos.Y(sampleNum[6]),
478 samplePos.Y(sampleNum[5]),
479 samplePos.Y(sampleNum[4]),
480 samplePos.Y(sampleNum[3]),
481 samplePos.Y(sampleNum[2]),
482 samplePos.Y(sampleNum[1]),
483 samplePos.Y(sampleNum[0]));
484 // add sample offset to UL pixel corner
485 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
486 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
487
488 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
489 static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
490 simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7],
491 inputMask[6],
492 inputMask[5],
493 inputMask[4],
494 inputMask[3],
495 inputMask[2],
496 inputMask[1],
497 inputMask[0]);
498 simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
499
500 static const simdscalari vZero = _simd_setzero_si();
501 const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
502 simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
503 simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
504 simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
505
506 simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
507
508 // set the centroid position based on results from above
509 psContext.vX.centroid =
510 _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
511 psContext.vY.centroid =
512 _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
513
514 // Case (3a) No samples covered and partial sample mask
515 simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
516 // sample mask should never be all 0's for this case, but handle it anyways
517 unsigned long firstCoveredSampleMaskSample = 0;
518 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask))
519 : (firstCoveredSampleMaskSample = 0);
520
521 simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
522
523 vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
524 vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
525
526 // blend in case 3a pixel locations
527 psContext.vX.centroid =
528 _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
529 psContext.vY.centroid =
530 _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
531 }
532
533 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs,
534 SWR_PS_CONTEXT& psContext,
535 const simdscalar& vXSamplePosUL,
536 const simdscalar& vYSamplePosUL)
537 {
538 // evaluate I,J
539 psContext.vI.centroid =
540 vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
541 psContext.vJ.centroid =
542 vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
543 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
544 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
545
546 // interpolate 1/w
547 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW,
548 coeffs.vBOneOverW,
549 coeffs.vCOneOverW,
550 psContext.vI.centroid,
551 psContext.vJ.centroid);
552 }
553
554 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const& z, float minz, float maxz)
555 {
556 const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
557 const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
558
559 return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
560 }
561
562 template <typename T>
563 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
564 {
565 // RT has to be single sample if we're in forcedMSAA mode
566 if (T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
567 {
568 return 1;
569 }
570 // unless we're forced to single sample, in which case we run the OM at the sample count of the
571 // RT
572 else if (T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
573 {
574 return GetNumSamples(blendSampleCount);
575 }
576 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
577 else
578 {
579 return T::MultisampleT::numSamples;
580 }
581 }
582
583 inline void SetupBarycentricCoeffs(BarycentricCoeffs* coeffs, const SWR_TRIANGLE_DESC& work)
584 {
585 // broadcast scalars
586
587 coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
588 coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
589 coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
590
591 coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
592 coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
593 coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
594
595 coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
596 coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
597 coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
598
599 coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
600
601 coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
602 coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
603 coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
604 }
605
606 inline void SetupRenderBuffers(uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS],
607 uint8_t** pDepthBuffer,
608 uint8_t** pStencilBuffer,
609 uint32_t colorHotTileMask,
610 RenderOutputBuffers& renderBuffers)
611 {
612 unsigned long index;
613 while (_BitScanForward(&index, colorHotTileMask))
614 {
615 assert(index < SWR_NUM_RENDERTARGETS);
616 colorHotTileMask &= ~(1 << index);
617 pColorBuffer[index] = renderBuffers.pColor[index];
618 }
619
620 if (pDepthBuffer)
621 {
622 *pDepthBuffer = renderBuffers.pDepth;
623 }
624
625 if (pStencilBuffer)
626 {
627 *pStencilBuffer = renderBuffers.pStencil;
628 ;
629 }
630 }
631
632 INLINE void SetRenderHotTilesDirty(DRAW_CONTEXT* pDC, RenderOutputBuffers& renderBuffers)
633 {
634 const API_STATE& state = GetApiState(pDC);
635
636 unsigned long rtSlot = 0;
637 uint32_t colorHottileEnableMask = state.colorHottileEnable;
638 while (_BitScanForward(&rtSlot, colorHottileEnableMask))
639 {
640 colorHottileEnableMask &= ~(1 << rtSlot);
641 renderBuffers.pColorHotTile[rtSlot]->state = HOTTILE_DIRTY;
642 }
643 }
644
645 template <typename T>
646 void SetupPixelShaderContext(SWR_PS_CONTEXT* psContext,
647 const SWR_MULTISAMPLE_POS& samplePos,
648 SWR_TRIANGLE_DESC& work)
649 {
650 psContext->pAttribs = work.pAttribs;
651 psContext->pPerspAttribs = work.pPerspAttribs;
652 psContext->frontFace = work.triFlags.frontFacing;
653 psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
654 psContext->viewportIndex = work.triFlags.viewportIndex;
655
656 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull
657 // attribs
658 psContext->I = work.I;
659 psContext->J = work.J;
660
661 psContext->recipDet = work.recipDet;
662 psContext->pRecipW = work.pRecipW;
663 psContext->pSamplePosX =
664 samplePos.X(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
665 psContext->pSamplePosY =
666 samplePos.Y(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
667 psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
668 psContext->sampleIndex = 0;
669 }
670
671 template <typename T, bool IsSingleSample>
672 void CalcCentroid(SWR_PS_CONTEXT* psContext,
673 const SWR_MULTISAMPLE_POS& samplePos,
674 const BarycentricCoeffs& coeffs,
675 const uint64_t* const coverageMask,
676 uint32_t sampleMask)
677 {
678 if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid
679 // positions are still different
680 {
681 // for 1x case, centroid is pixel center
682 psContext->vX.centroid = psContext->vX.center;
683 psContext->vY.centroid = psContext->vY.center;
684 psContext->vI.centroid = psContext->vI.center;
685 psContext->vJ.centroid = psContext->vJ.center;
686 psContext->vOneOverW.centroid = psContext->vOneOverW.center;
687 }
688 else
689 {
690 if (T::bCentroidPos)
691 {
692 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
693 if (T::bIsCenterPattern)
694 {
695 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
696 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
697 }
698 else
699 {
700 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate
701 // coverage 2X'..
702 CalcCentroidPos<T>(*psContext,
703 samplePos,
704 coverageMask,
705 sampleMask,
706 psContext->vX.UL,
707 psContext->vY.UL);
708 }
709
710 CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
711 }
712 else
713 {
714 psContext->vX.centroid = psContext->vX.sample;
715 psContext->vY.centroid = psContext->vY.sample;
716 }
717 }
718 }
719
720 template <typename T>
721 struct PixelRateZTestLoop
722 {
723 PixelRateZTestLoop(DRAW_CONTEXT* DC,
724 uint32_t _workerId,
725 const SWR_TRIANGLE_DESC& Work,
726 const BarycentricCoeffs& Coeffs,
727 const API_STATE& apiState,
728 uint8_t*& depthBuffer,
729 uint8_t*& stencilBuffer,
730 const uint8_t ClipDistanceMask) :
731 pDC(DC),
732 workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
733 samplePos(state.rastState.samplePositions), clipDistanceMask(ClipDistanceMask),
734 pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
735
736 INLINE
737 uint32_t operator()(simdscalar& activeLanes,
738 SWR_PS_CONTEXT& psContext,
739 const CORE_BUCKETS BEDepthBucket,
740 uint32_t currentSimdIn8x8 = 0)
741 {
742
743 uint32_t statCount = 0;
744 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
745 for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
746 {
747 const uint8_t* pCoverageMask = (uint8_t*)&work.coverageMask[sample];
748 vCoverageMask[sample] =
749 _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
750
751 if (!_simd_movemask_ps(vCoverageMask[sample]))
752 {
753 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] =
754 _simd_setzero_ps();
755 continue;
756 }
757
758 // offset depth/stencil buffers current sample
759 uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
760 uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
761
762 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
763 {
764 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
765 "Unsupported depth hot tile format");
766
767 const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
768
769 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
770 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
771
772 vCoverageMask[sample] =
773 _simd_and_ps(vCoverageMask[sample],
774 _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
775 }
776
777 RDTSC_BEGIN(psContext.pBucketManager, BEBarycentric, pDC->drawId);
778
779 // calculate per sample positions
780 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
781 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
782
783 // calc I & J per sample
784 CalcSampleBarycentrics(coeffs, psContext);
785
786 if (psState.writesODepth)
787 {
788 {
789 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
790 vZ[sample] = psContext.vZ;
791 }
792 }
793 else
794 {
795 vZ[sample] = vplaneps(
796 coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
797 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
798 }
799
800 RDTSC_END(psContext.pBucketManager, BEBarycentric, 0);
801
802 ///@todo: perspective correct vs non-perspective correct clipping?
803 // if clip distances are enabled, we need to interpolate for each sample
804 if (clipDistanceMask)
805 {
806 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask,
807 work.pUserClipBuffer,
808 psContext.vI.sample,
809 psContext.vJ.sample);
810
811 vCoverageMask[sample] =
812 _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
813 }
814
815 // ZTest for this sample
816 ///@todo Need to uncomment out this bucket.
817 // RDTSC_BEGIN(psContext.pBucketManager, BEDepthBucket, pDC->drawId);
818 depthPassMask[sample] = vCoverageMask[sample];
819 stencilPassMask[sample] = vCoverageMask[sample];
820 depthPassMask[sample] = DepthStencilTest(&state,
821 work.triFlags.frontFacing,
822 work.triFlags.viewportIndex,
823 vZ[sample],
824 pDepthSample,
825 vCoverageMask[sample],
826 pStencilSample,
827 &stencilPassMask[sample]);
828 // RDTSC_END(psContext.pBucketManager, BEDepthBucket, 0);
829
830 // early-exit if no pixels passed depth or earlyZ is forced on
831 if (psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
832 {
833 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
834 &state.depthStencilState,
835 work.triFlags.frontFacing,
836 vZ[sample],
837 pDepthSample,
838 depthPassMask[sample],
839 vCoverageMask[sample],
840 pStencilSample,
841 stencilPassMask[sample]);
842
843 if (!_simd_movemask_ps(depthPassMask[sample]))
844 {
845 continue;
846 }
847 }
848 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
849 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
850 statCount += _mm_popcnt_u32(statMask);
851 }
852
853 activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
854 // return number of samples that passed depth and coverage
855 return statCount;
856 }
857
858 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
859 simdscalar vZ[T::MultisampleT::numCoverageSamples];
860 simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
861 simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
862 simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
863
864 private:
865 // functor inputs
866 DRAW_CONTEXT* pDC;
867 uint32_t workerId;
868
869 const SWR_TRIANGLE_DESC& work;
870 const BarycentricCoeffs& coeffs;
871 const API_STATE& state;
872 const SWR_PS_STATE& psState;
873 const SWR_MULTISAMPLE_POS& samplePos;
874 const uint8_t clipDistanceMask;
875 uint8_t*& pDepthBuffer;
876 uint8_t*& pStencilBuffer;
877 };
878
879 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT& psContext)
880 {
881 // evaluate I,J
882 psContext.vI.center =
883 vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
884 psContext.vJ.center =
885 vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
886 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
887 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
888
889 // interpolate 1/w
890 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW,
891 coeffs.vBOneOverW,
892 coeffs.vCOneOverW,
893 psContext.vI.center,
894 psContext.vJ.center);
895 }
896
897 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
898 SWR_PS_CONTEXT& psContext)
899 {
900 // evaluate I,J
901 psContext.vI.sample =
902 vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
903 psContext.vJ.sample =
904 vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
905 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
906 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
907
908 // interpolate 1/w
909 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW,
910 coeffs.vBOneOverW,
911 coeffs.vCOneOverW,
912 psContext.vI.sample,
913 psContext.vJ.sample);
914 }
915
916 // Merge Output to 8x2 SIMD16 Tile Format
917 INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC,
918 SWR_PS_CONTEXT& psContext,
919 uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS],
920 uint32_t sample,
921 const SWR_BLEND_STATE* pBlendState,
922 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS],
923 simdscalar& coverageMask,
924 simdscalar const& depthPassMask,
925 uint32_t renderTargetMask,
926 bool useAlternateOffset,
927 uint32_t workerId)
928 {
929 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
930 uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
931
932 if (useAlternateOffset)
933 {
934 rasterTileColorOffset += sizeof(simdscalar);
935 }
936
937 simdvector blendSrc;
938 simdvector blendOut;
939
940 unsigned long rt;
941 while (_BitScanForward(&rt, renderTargetMask))
942 {
943 renderTargetMask &= ~(1 << rt);
944
945 const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt];
946
947 simdscalar* pColorSample;
948 bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed ||
949 !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
950 if (hotTileEnable)
951 {
952 pColorSample = reinterpret_cast<simdscalar*>(pColorBase[rt] + rasterTileColorOffset);
953 blendSrc[0] = pColorSample[0];
954 blendSrc[1] = pColorSample[2];
955 blendSrc[2] = pColorSample[4];
956 blendSrc[3] = pColorSample[6];
957 }
958 else
959 {
960 pColorSample = nullptr;
961 }
962
963 SWR_BLEND_CONTEXT blendContext = {0};
964 {
965 // pfnBlendFunc may not update all channels. Initialize with PS output.
966 /// TODO: move this into the blend JIT.
967 blendOut = psContext.shaded[rt];
968
969 blendContext.pBlendState = pBlendState;
970 blendContext.src = &psContext.shaded[rt];
971 blendContext.src1 = &psContext.shaded[1];
972 blendContext.src0alpha = reinterpret_cast<simdvector*>(&psContext.shaded[0].w);
973 blendContext.sampleNum = sample;
974 blendContext.pDst = &blendSrc;
975 blendContext.result = &blendOut;
976 blendContext.oMask = &psContext.oMask;
977 blendContext.pMask = reinterpret_cast<simdscalari*>(&coverageMask);
978
979 // Blend outputs and update coverage mask for alpha test
980 if (pfnBlendFunc[rt] != nullptr)
981 {
982 pfnBlendFunc[rt](&blendContext);
983 }
984 }
985
986 // Track alpha events
987 AR_EVENT(
988 AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
989
990 // final write mask
991 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
992
993 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
994 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
995 "Unsupported hot tile format");
996
997 // store with color mask
998 if (!pRTBlend->writeDisableRed)
999 {
1000 _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[0]), outputMask, blendOut.x);
1001 }
1002 if (!pRTBlend->writeDisableGreen)
1003 {
1004 _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[2]), outputMask, blendOut.y);
1005 }
1006 if (!pRTBlend->writeDisableBlue)
1007 {
1008 _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[4]), outputMask, blendOut.z);
1009 }
1010 if (!pRTBlend->writeDisableAlpha)
1011 {
1012 _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[6]), outputMask, blendOut.w);
1013 }
1014 }
1015 }
1016
1017 template <typename T>
1018 void BackendPixelRate(DRAW_CONTEXT* pDC,
1019 uint32_t workerId,
1020 uint32_t x,
1021 uint32_t y,
1022 SWR_TRIANGLE_DESC& work,
1023 RenderOutputBuffers& renderBuffers)
1024 {
1025 ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the
1026 /// backend
1027
1028
1029 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelRateBackend, pDC->drawId);
1030 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
1031
1032 const API_STATE& state = GetApiState(pDC);
1033
1034 BarycentricCoeffs coeffs;
1035 SetupBarycentricCoeffs(&coeffs, work);
1036
1037 SWR_CONTEXT* pContext = pDC->pContext;
1038 void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1039
1040 SWR_PS_CONTEXT psContext;
1041 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
1042 SetupPixelShaderContext<T>(&psContext, samplePos, work);
1043
1044 uint8_t *pDepthBuffer, *pStencilBuffer;
1045 SetupRenderBuffers(psContext.pColorBuffer,
1046 &pDepthBuffer,
1047 &pStencilBuffer,
1048 state.colorHottileEnable,
1049 renderBuffers);
1050
1051 bool isTileDirty = false;
1052
1053 RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
1054
1055 PixelRateZTestLoop<T> PixelRateZTest(pDC,
1056 workerId,
1057 work,
1058 coeffs,
1059 state,
1060 pDepthBuffer,
1061 pStencilBuffer,
1062 state.backendState.clipDistanceMask);
1063
1064 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
1065 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
1066
1067 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
1068
1069 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1070 {
1071 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
1072 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
1073
1074 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
1075
1076 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1077 {
1078 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
1079
1080
1081 simdscalar activeLanes;
1082 if (!(work.anyCoveredSamples & MASK))
1083 {
1084 goto Endtile;
1085 };
1086 activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);
1087
1088 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
1089 {
1090 const uint64_t* pCoverageMask =
1091 (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1092 ? &work.innerCoverageMask
1093 : &work.coverageMask[0];
1094
1095 generateInputCoverage<T, T::InputCoverage>(
1096 pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
1097 }
1098
1099 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
1100
1101 CalcPixelBarycentrics(coeffs, psContext);
1102
1103 CalcCentroid<T, false>(
1104 &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
1105
1106 RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
1107
1108 if (T::bForcedSampleCount)
1109 {
1110 // candidate pixels (that passed coverage) will cause shader invocation if any bits
1111 // in the samplemask are set
1112 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(
1113 _simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
1114 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
1115 }
1116
1117 // Early-Z?
1118 if (T::bCanEarlyZ && !T::bForcedSampleCount)
1119 {
1120 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
1121 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
1122 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
1123 }
1124
1125 // if we have no covered samples that passed depth at this point, go to next tile
1126 if (!_simd_movemask_ps(activeLanes))
1127 {
1128 goto Endtile;
1129 };
1130
1131 if (state.psState.usesSourceDepth)
1132 {
1133 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
1134 // interpolate and quantize z
1135 psContext.vZ = vplaneps(
1136 coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1137 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1138 RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
1139 }
1140
1141 // pixels that are currently active
1142 psContext.activeMask = _simd_castps_si(activeLanes);
1143 psContext.oMask = T::MultisampleT::FullSampleMask();
1144
1145 // execute pixel shader
1146 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
1147 state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
1148 RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
1149
1150 // update stats
1151 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
1152 AR_EVENT(PSStats((HANDLE)&psContext.stats));
1153
1154 // update active lanes to remove any discarded or oMask'd pixels
1155 activeLanes = _simd_castsi_ps(_simd_and_si(
1156 psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
1157 if (!_simd_movemask_ps(activeLanes))
1158 {
1159 goto Endtile;
1160 };
1161
1162 isTileDirty = true;
1163
1164 // late-Z
1165 if (!T::bCanEarlyZ && !T::bForcedSampleCount)
1166 {
1167 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
1168 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
1169 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
1170 }
1171
1172 // if we have no covered samples that passed depth at this point, skip OM and go to next
1173 // tile
1174 if (!_simd_movemask_ps(activeLanes))
1175 {
1176 goto Endtile;
1177 };
1178
1179 // output merger
1180 // loop over all samples, broadcasting the results of the PS to all passing pixels
1181 for (uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount);
1182 sample++)
1183 {
1184 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
1185 // center pattern does a single coverage/depth/stencil test, standard pattern tests
1186 // all samples
1187 uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
1188 simdscalar coverageMask, depthMask;
1189 if (T::bForcedSampleCount)
1190 {
1191 coverageMask = depthMask = activeLanes;
1192 }
1193 else
1194 {
1195 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
1196 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
1197 if (!_simd_movemask_ps(depthMask))
1198 {
1199 // stencil should already have been written in early/lateZ tests
1200 RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
1201 continue;
1202 }
1203 }
1204
1205 // broadcast the results of the PS to all passing pixels
1206
1207 OutputMerger8x2(pDC,
1208 psContext,
1209 psContext.pColorBuffer,
1210 sample,
1211 &state.blendState,
1212 state.pfnBlendFunc,
1213 coverageMask,
1214 depthMask,
1215 state.psState.renderTargetMask,
1216 useAlternateOffset,
1217 workerId);
1218
1219
1220 if (!state.psState.forceEarlyZ && !T::bForcedSampleCount)
1221 {
1222 uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1223 uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1224
1225 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
1226 &state.depthStencilState,
1227 work.triFlags.frontFacing,
1228 PixelRateZTest.vZ[coverageSampleNum],
1229 pDepthSample,
1230 depthMask,
1231 coverageMask,
1232 pStencilSample,
1233 PixelRateZTest.stencilPassMask[coverageSampleNum]);
1234 }
1235 RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
1236 }
1237 Endtile:
1238 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
1239
1240 for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1241 {
1242 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1243 }
1244
1245 if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1246 {
1247 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1248 }
1249 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1250
1251 if (useAlternateOffset)
1252 {
1253 unsigned long rt;
1254 uint32_t rtMask = state.colorHottileEnable;
1255 while (_BitScanForward(&rt, rtMask))
1256 {
1257 rtMask &= ~(1 << rt);
1258 psContext.pColorBuffer[rt] +=
1259 (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1260 }
1261 }
1262
1263 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1264 pStencilBuffer +=
1265 (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1266
1267 RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
1268
1269 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
1270 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
1271 }
1272
1273 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
1274 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
1275 }
1276
1277 if (isTileDirty)
1278 {
1279 SetRenderHotTilesDirty(pDC, renderBuffers);
1280 }
1281
1282 RDTSC_END(pDC->pContext->pBucketMgr, BEPixelRateBackend, 0);
1283 }
1284
1285 template <uint32_t sampleCountT = SWR_MULTISAMPLE_1X,
1286 uint32_t isCenter = 0,
1287 uint32_t coverage = 0,
1288 uint32_t centroid = 0,
1289 uint32_t forced = 0,
1290 uint32_t canEarlyZ = 0
1291 >
1292 struct SwrBackendTraits
1293 {
1294 static const bool bIsCenterPattern = (isCenter == 1);
1295 static const uint32_t InputCoverage = coverage;
1296 static const bool bCentroidPos = (centroid == 1);
1297 static const bool bForcedSampleCount = (forced == 1);
1298 static const bool bCanEarlyZ = (canEarlyZ == 1);
1299 typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
1300 };
1301