1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "binner.h"
30 #include "context.h"
31 #include "frontend.h"
32 #include "conservativeRast.h"
33 #include "pa.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
36 #include "tilemgr.h"
37
38 // Function Prototype
39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
40 void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC,
41 PA_STATE& pa,
42 uint32_t workerId,
43 Vec4<SIMD_T> prim[],
44 Float<SIMD_T> recipW[],
45 uint32_t primMask,
46 Integer<SIMD_T> const& primID,
47 Integer<SIMD_T> const& viewportIdx,
48 Integer<SIMD_T> const& rtIdx);
49
50 template <typename SIMD_T, uint32_t SIMD_WIDTH>
51 void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC,
52 PA_STATE& pa,
53 uint32_t workerId,
54 Vec4<SIMD_T> prim[],
55 uint32_t primMask,
56 Integer<SIMD_T> const& primID,
57 Integer<SIMD_T> const& viewportIdx,
58 Integer<SIMD_T> const& rtIdx);
59
60 //////////////////////////////////////////////////////////////////////////
61 /// @brief Processes attributes for the backend based on linkage mask and
62 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
63 /// @param pDC - Draw context
64 /// @param pa - Primitive Assembly state
65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
66 /// @param pLinkageMap - maps VS attribute slot to PS slot
67 /// @param triIndex - Triangle to process attributes for
68 /// @param pBuffer - Output result
69 template <typename NumVertsT,
70 typename IsSwizzledT,
71 typename HasConstantInterpT,
72 typename IsDegenerate>
ProcessAttributes(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t triIndex,uint32_t primId,float * pBuffer)73 INLINE void ProcessAttributes(
74 DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t triIndex, uint32_t primId, float* pBuffer)
75 {
76 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
77 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
78 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
79 uint32_t constantInterpMask =
80 IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
81 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
82 const PRIMITIVE_TOPOLOGY topo = pa.binTopology;
83
84 static const float constTable[3][4] = {
85 {0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 1.0f}, {1.0f, 1.0f, 1.0f, 1.0f}};
86
87 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
88 {
89 uint32_t inputSlot;
90 if (IsSwizzledT::value)
91 {
92 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
93 inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
94 }
95 else
96 {
97 inputSlot = backendState.vertexAttribOffset + i;
98 }
99
100 simd4scalar attrib[3]; // triangle attribs (always 4 wide)
101 float* pAttribStart = pBuffer;
102
103 if (HasConstantInterpT::value || IsDegenerate::value)
104 {
105 if (CheckBit(constantInterpMask, i))
106 {
107 uint32_t vid;
108 uint32_t adjustedTriIndex;
109 static const uint32_t tristripProvokingVertex[] = {0, 2, 1};
110 static const int32_t quadProvokingTri[2][4] = {{0, 0, 0, 1}, {0, -1, 0, 0}};
111 static const uint32_t quadProvokingVertex[2][4] = {{0, 1, 2, 2}, {0, 1, 1, 2}};
112 static const int32_t qstripProvokingTri[2][4] = {{0, 0, 0, 1}, {-1, 0, 0, 0}};
113 static const uint32_t qstripProvokingVertex[2][4] = {{0, 1, 2, 1}, {0, 0, 2, 1}};
114
115 switch (topo)
116 {
117 case TOP_QUAD_LIST:
118 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
119 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
120 break;
121 case TOP_QUAD_STRIP:
122 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
123 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
124 break;
125 case TOP_TRIANGLE_STRIP:
126 adjustedTriIndex = triIndex;
127 vid =
128 (triIndex & 1) ? tristripProvokingVertex[provokingVertex] : provokingVertex;
129 break;
130 default:
131 adjustedTriIndex = triIndex;
132 vid = provokingVertex;
133 break;
134 }
135
136 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
137
138 for (uint32_t i = 0; i < NumVertsT::value; ++i)
139 {
140 SIMD128::store_ps(pBuffer, attrib[vid]);
141 pBuffer += 4;
142 }
143 }
144 else
145 {
146 pa.AssembleSingle(inputSlot, triIndex, attrib);
147
148 for (uint32_t i = 0; i < NumVertsT::value; ++i)
149 {
150 SIMD128::store_ps(pBuffer, attrib[i]);
151 pBuffer += 4;
152 }
153 }
154 }
155 else
156 {
157 pa.AssembleSingle(inputSlot, triIndex, attrib);
158
159 for (uint32_t i = 0; i < NumVertsT::value; ++i)
160 {
161 SIMD128::store_ps(pBuffer, attrib[i]);
162 pBuffer += 4;
163 }
164 }
165
166 // pad out the attrib buffer to 3 verts to ensure the triangle
167 // interpolation code in the pixel shader works correctly for the
168 // 3 topologies - point, line, tri. This effectively zeros out the
169 // effect of the missing vertices in the triangle interpolation.
170 for (uint32_t v = NumVertsT::value; v < 3; ++v)
171 {
172 SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
173 pBuffer += 4;
174 }
175
176 // check for constant source overrides
177 if (IsSwizzledT::value)
178 {
179 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
180 if (mask)
181 {
182 unsigned long comp;
183 while (_BitScanForward(&comp, mask))
184 {
185 mask &= ~(1 << comp);
186
187 float constantValue = 0.0f;
188 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
189 {
190 case SWR_CONSTANT_SOURCE_CONST_0000:
191 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
192 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
193 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
194 break;
195 case SWR_CONSTANT_SOURCE_PRIM_ID:
196 constantValue = *(float*)&primId;
197 break;
198 }
199
200 // apply constant value to all 3 vertices
201 for (uint32_t v = 0; v < 3; ++v)
202 {
203 pAttribStart[comp + v * 4] = constantValue;
204 }
205 }
206 }
207 }
208 }
209 }
210
211 typedef void (*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
212
213 struct ProcessAttributesChooser
214 {
215 typedef PFN_PROCESS_ATTRIBUTES FuncType;
216
217 template <typename... ArgsB>
GetFuncProcessAttributesChooser218 static FuncType GetFunc()
219 {
220 return ProcessAttributes<ArgsB...>;
221 }
222 };
223
GetProcessAttributesFunc(uint32_t NumVerts,bool IsSwizzled,bool HasConstantInterp,bool IsDegenerate=false)224 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts,
225 bool IsSwizzled,
226 bool HasConstantInterp,
227 bool IsDegenerate = false)
228 {
229 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(
230 IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
231 }
232
233 //////////////////////////////////////////////////////////////////////////
234 /// @brief Processes enabled user clip distances. Loads the active clip
235 /// distances from the PA, sets up barycentric equations, and
236 /// stores the results to the output buffer
237 /// @param pa - Primitive Assembly state
238 /// @param primIndex - primitive index to process
239 /// @param clipDistMask - mask of enabled clip distances
240 /// @param pUserClipBuffer - buffer to store results
241 template <uint32_t NumVerts>
ProcessUserClipDist(const SWR_BACKEND_STATE & state,PA_STATE & pa,uint32_t primIndex,float * pRecipW,float * pUserClipBuffer)242 void ProcessUserClipDist(const SWR_BACKEND_STATE& state,
243 PA_STATE& pa,
244 uint32_t primIndex,
245 float* pRecipW,
246 float* pUserClipBuffer)
247 {
248 unsigned long clipDist;
249 uint32_t clipDistMask = state.clipDistanceMask;
250 while (_BitScanForward(&clipDist, clipDistMask))
251 {
252 clipDistMask &= ~(1 << clipDist);
253 uint32_t clipSlot = clipDist >> 2;
254 uint32_t clipComp = clipDist & 0x3;
255 uint32_t clipAttribSlot =
256 clipSlot == 0 ? state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
257
258 simd4scalar primClipDist[3];
259 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
260
261 float vertClipDist[NumVerts];
262 for (uint32_t e = 0; e < NumVerts; ++e)
263 {
264 OSALIGNSIMD(float) aVertClipDist[4];
265 SIMD128::store_ps(aVertClipDist, primClipDist[e]);
266 vertClipDist[e] = aVertClipDist[clipComp];
267 };
268
269 // setup plane equations for barycentric interpolation in the backend
270 float baryCoeff[NumVerts];
271 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
272 for (uint32_t e = 0; e < NumVerts - 1; ++e)
273 {
274 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
275 }
276 baryCoeff[NumVerts - 1] = last;
277
278 for (uint32_t e = 0; e < NumVerts; ++e)
279 {
280 *(pUserClipBuffer++) = baryCoeff[e];
281 }
282 }
283 }
284
285 INLINE
TransposeVertices(simd4scalar (& dst)[8],const simdscalar & src0,const simdscalar & src1,const simdscalar & src2)286 void TransposeVertices(simd4scalar (&dst)[8],
287 const simdscalar& src0,
288 const simdscalar& src1,
289 const simdscalar& src2)
290 {
291 vTranspose3x8(dst, src0, src1, src2);
292 }
293
294 INLINE
TransposeVertices(simd4scalar (& dst)[16],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2)295 void TransposeVertices(simd4scalar (&dst)[16],
296 const simd16scalar& src0,
297 const simd16scalar& src1,
298 const simd16scalar& src2)
299 {
300 vTranspose4x16(
301 reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
302 }
303
304 #if KNOB_ENABLE_EARLY_RAST
305
// Early-rast (ER) tile dimensions, derived from the corresponding shift
// amounts. They stay macros because they are tested with #if further down.
#define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT)
#define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT)

// Primary template is intentionally empty; each supported SIMD type provides
// a specialization exposing InitShiftCntrl().
template <typename SIMD_T>
struct EarlyRastHelper
{
};
313
314 template <>
315 struct EarlyRastHelper<SIMD256>
316 {
InitShiftCntrlEarlyRastHelper317 static SIMD256::Integer InitShiftCntrl()
318 {
319 return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
320 }
321 };
322
#if USE_SIMD16_FRONTEND
// 16-lane helper: same scheme as the 8-lane version, with counts 16..31.
template <>
struct EarlyRastHelper<SIMD512>
{
    // Assuming set_epi32 takes the highest lane first, lane k receives the
    // count 31 - k.
    static SIMD512::Integer InitShiftCntrl()
    {
        const SIMD512::Integer shiftCounts =
            SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
        return shiftCounts;
    }
};

#endif
334 //////////////////////////////////////////////////////////////////////////
335 /// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
336 /// (ER tile) can be rasterized as early as in binner to check if
337 /// they cover any pixels. If not - the triangles can be
338 /// culled in binner.
339 ///
340 /// @param er_bbox - coordinates of ER tile for each triangle
341 /// @param vAi - A coefficients of triangle edges
342 /// @param vBi - B coefficients of triangle edges
343 /// @param vXi - X coordinates of triangle vertices
344 /// @param vYi - Y coordinates of triangle vertices
345 /// @param frontWindingTris - mask indicating CCW/CW triangles
346 /// @param triMask - mask for valid SIMD lanes (triangles)
347 /// @param oneTileMask - defines triangles for ER to work on
348 /// (tris that fit into ER tile)
349 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
EarlyRasterizer(DRAW_CONTEXT * pDC,SIMDBBOX_T<SIMD_T> & er_bbox,Integer<SIMD_T> (& vAi)[3],Integer<SIMD_T> (& vBi)[3],Integer<SIMD_T> (& vXi)[3],Integer<SIMD_T> (& vYi)[3],uint32_t cwTrisMask,uint32_t triMask,uint32_t oneTileMask)350 uint32_t SIMDCALL EarlyRasterizer(DRAW_CONTEXT* pDC,
351 SIMDBBOX_T<SIMD_T>& er_bbox,
352 Integer<SIMD_T> (&vAi)[3],
353 Integer<SIMD_T> (&vBi)[3],
354 Integer<SIMD_T> (&vXi)[3],
355 Integer<SIMD_T> (&vYi)[3],
356 uint32_t cwTrisMask,
357 uint32_t triMask,
358 uint32_t oneTileMask)
359 {
360 // step to pixel center of top-left pixel of the triangle bbox
361 Integer<SIMD_T> vTopLeftX =
362 SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
363 vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
364
365 Integer<SIMD_T> vTopLeftY =
366 SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
367 vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
368
369 // negate A and B for CW tris
370 Integer<SIMD_T> vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
371 Integer<SIMD_T> vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
372 Integer<SIMD_T> vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
373 Integer<SIMD_T> vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
374 Integer<SIMD_T> vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
375 Integer<SIMD_T> vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
376
377 RDTSC_EVENT(pDC->pContext->pBucketMgr,
378 FEEarlyRastEnter,
379 _mm_popcnt_u32(oneTileMask & triMask),
380 0);
381
382 Integer<SIMD_T> vShiftCntrl = EarlyRastHelper<SIMD_T>::InitShiftCntrl();
383 Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask);
384 Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
385
386 vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(
387 SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
388 vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(
389 SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
390 vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(
391 SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
392 vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(
393 SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
394 vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(
395 SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
396 vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(
397 SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
398
399 // evaluate edge equations at top-left pixel
400 Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
401 Integer<SIMD_T> vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
402 Integer<SIMD_T> vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
403
404 Integer<SIMD_T> vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
405 Integer<SIMD_T> vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
406 Integer<SIMD_T> vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
407
408 Integer<SIMD_T> vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
409 Integer<SIMD_T> vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
410 Integer<SIMD_T> vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
411
412 Integer<SIMD_T> vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
413 Integer<SIMD_T> vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
414 Integer<SIMD_T> vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
415
416 Integer<SIMD_T> vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
417 Integer<SIMD_T> vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
418 Integer<SIMD_T> vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
419
420 vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
421 vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
422 vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
423
424 // top left rule
425 Integer<SIMD_T> vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
426 Integer<SIMD_T> vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
427 Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
428
429 // vA < 0
430 vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(
431 SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
432 vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(
433 SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
434 vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(
435 SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
436
437 // vA == 0 && vB < 0
438 Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
439 Integer<SIMD_T> vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
440 Integer<SIMD_T> vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
441
442 vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
443 vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
444 vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);
445
446 vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(
447 SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
448 vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(
449 SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
450 vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(
451 SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
452
453 #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
454 // Go down
455 // coverage pixel 0
456 Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
457 vMask0 = SIMD_T::and_si(vMask0, vEdge2);
458
459 // coverage pixel 1
460 Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
461 Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
462 Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
463 Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
464 vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
465
466 // coverage pixel 2
467 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
468 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
469 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
470 Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
471 vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
472
473 // coverage pixel 3
474 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
475 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
476 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
477 Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
478 vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
479
480 // One step to the right and then up
481
482 // coverage pixel 4
483 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
484 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
485 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
486 Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
487 vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
488
489 // coverage pixel 5
490 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
491 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
492 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
493 Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
494 vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
495
496 // coverage pixel 6
497 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
498 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
499 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
500 Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
501 vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
502
503 // coverage pixel 7
504 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
505 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
506 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
507 Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
508 vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
509
510 Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1);
511 vLit1 = SIMD_T::or_si(vLit1, vMask2);
512 vLit1 = SIMD_T::or_si(vLit1, vMask3);
513 vLit1 = SIMD_T::or_si(vLit1, vMask4);
514 vLit1 = SIMD_T::or_si(vLit1, vMask5);
515 vLit1 = SIMD_T::or_si(vLit1, vMask6);
516 vLit1 = SIMD_T::or_si(vLit1, vMask7);
517
518 // Step to the right and go down again
519
520 // coverage pixel 0
521 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
522 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
523 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
524 vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N);
525 vMask0 = SIMD_T::and_si(vMask0, vEdge2N);
526
527 // coverage pixel 1
528 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
529 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
530 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
531 vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
532 vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
533
534 // coverage pixel 2
535 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
536 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
537 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
538 vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
539 vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
540
541 // coverage pixel 3
542 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
543 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
544 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
545 vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
546 vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
547
548 // And for the last time - to the right and up
549
550 // coverage pixel 4
551 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
552 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
553 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
554 vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
555 vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
556
557 // coverage pixel 5
558 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
559 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
560 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
561 vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
562 vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
563
564 // coverage pixel 6
565 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
566 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
567 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
568 vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
569 vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
570
571 // coverage pixel 7
572 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
573 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
574 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
575 vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
576 vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
577
578 Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1);
579 vLit2 = SIMD_T::or_si(vLit2, vMask2);
580 vLit2 = SIMD_T::or_si(vLit2, vMask3);
581 vLit2 = SIMD_T::or_si(vLit2, vMask4);
582 vLit2 = SIMD_T::or_si(vLit2, vMask5);
583 vLit2 = SIMD_T::or_si(vLit2, vMask6);
584 vLit2 = SIMD_T::or_si(vLit2, vMask7);
585
586 Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2);
587
588 #else
589 // Generic algorithm sweeping in row by row order
590 Integer<SIMD_T> vRowMask[ER_SIMD_TILE_Y_DIM];
591
592 Integer<SIMD_T> vEdge0N = vEdge0;
593 Integer<SIMD_T> vEdge1N = vEdge1;
594 Integer<SIMD_T> vEdge2N = vEdge2;
595
596 for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
597 {
598 // Store edge values at the beginning of the row
599 Integer<SIMD_T> vRowEdge0 = vEdge0N;
600 Integer<SIMD_T> vRowEdge1 = vEdge1N;
601 Integer<SIMD_T> vRowEdge2 = vEdge2N;
602
603 Integer<SIMD_T> vColMask[ER_SIMD_TILE_X_DIM];
604
605 for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
606 {
607 vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
608 vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);
609
610 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
611 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
612 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
613 }
614 vRowMask[row] = vColMask[0];
615 for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
616 {
617 vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);
618 }
619 // Restore values and go to the next row
620 vEdge0N = vRowEdge0;
621 vEdge1N = vRowEdge1;
622 vEdge2N = vRowEdge2;
623
624 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
625 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
626 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
627 }
628
629 // compress all masks
630 Integer<SIMD_T> vLit = vRowMask[0];
631 for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
632 {
633 vLit = SIMD_T::or_si(vLit, vRowMask[row]);
634 }
635
636 #endif
637 // Check which triangles has any pixel lit
638 uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
639 uint32_t maskUnlit = ~maskLit & oneTileMask;
640
641 uint32_t oldTriMask = triMask;
642 triMask &= ~maskUnlit;
643
644 if (triMask ^ oldTriMask)
645 {
646 RDTSC_EVENT(pDC->pContext->pBucketMgr,
647 FEEarlyRastExit,
648 _mm_popcnt_u32(triMask & oneTileMask),
649 0);
650 }
651 return triMask;
652 }
653
654 #endif // Early rasterizer
655
656 //////////////////////////////////////////////////////////////////////////
657 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
658 /// culling, viewport transform, etc.
659 /// @param pDC - pointer to draw context.
660 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
662 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
663 /// @param primID - Primitive ID for each triangle.
664 /// @param viewportIdx - viewport array index for each triangle.
665 /// @tparam CT - ConservativeRastFETraits
666 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
BinTrianglesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> tri[3],uint32_t triMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)667 void SIMDCALL BinTrianglesImpl(DRAW_CONTEXT* pDC,
668 PA_STATE& pa,
669 uint32_t workerId,
670 Vec4<SIMD_T> tri[3],
671 uint32_t triMask,
672 Integer<SIMD_T> const& primID,
673 Integer<SIMD_T> const& viewportIdx,
674 Integer<SIMD_T> const& rtIdx)
675 {
676 const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
677
678 RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinTriangles, pDC->drawId);
679
680 const API_STATE& state = GetApiState(pDC);
681 const SWR_RASTSTATE& rastState = state.rastState;
682 const SWR_FRONTEND_STATE& feState = state.frontendState;
683
684 MacroTileMgr* pTileMgr = pDC->pTileMgr;
685
686 Float<SIMD_T> vRecipW0 = SIMD_T::set1_ps(1.0f);
687 Float<SIMD_T> vRecipW1 = SIMD_T::set1_ps(1.0f);
688 Float<SIMD_T> vRecipW2 = SIMD_T::set1_ps(1.0f);
689
690 if (feState.vpTransformDisable)
691 {
692 // RHW is passed in directly when VP transform is disabled
693 vRecipW0 = tri[0].v[3];
694 vRecipW1 = tri[1].v[3];
695 vRecipW2 = tri[2].v[3];
696 }
697 else
698 {
699 // Perspective divide
700 vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
701 vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
702 vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
703
704 tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
705 tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
706 tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
707
708 tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
709 tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
710 tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
711
712 tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
713 tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
714 tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
715
716 // Viewport transform to screen space coords
717 if (pa.viewportArrayActive)
718 {
719 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
720 }
721 else
722 {
723 viewportTransform<3>(tri, state.vpMatrices);
724 }
725 }
726
727 // Adjust for pixel center location
728 Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
729
730 tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
731 tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
732
733 tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
734 tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
735
736 tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
737 tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
738
739 // Set vXi, vYi to required fixed point precision
740 Integer<SIMD_T> vXi[3], vYi[3];
741 FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
742
743 // triangle setup
744 Integer<SIMD_T> vAi[3], vBi[3];
745 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
746
747 // determinant
748 Integer<SIMD_T> vDet[2];
749 calcDeterminantIntVertical(vAi, vBi, vDet);
750
751 // cull zero area
752 uint32_t maskLo =
753 SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
754 uint32_t maskHi =
755 SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
756
757 uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
758
759 // don't cull degenerate triangles if we're conservatively rasterizing
760 uint32_t origTriMask = triMask;
761 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
762 {
763 triMask &= ~cullZeroAreaMask;
764 }
765
766 // determine front winding tris
767 // CW +det
768 // CCW det < 0;
769 // 0 area triangles are marked as backfacing regardless of winding order,
770 // which is required behavior for conservative rast and wireframe rendering
771 uint32_t frontWindingTris;
772 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
773 {
774 maskLo = SIMD_T::movemask_pd(
775 SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
776 maskHi = SIMD_T::movemask_pd(
777 SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
778 }
779 else
780 {
781 maskLo = SIMD_T::movemask_pd(
782 SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
783 maskHi = SIMD_T::movemask_pd(
784 SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
785 }
786 frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
787
788 // cull
789 uint32_t cullTris;
790 switch ((SWR_CULLMODE)rastState.cullMode)
791 {
792 case SWR_CULLMODE_BOTH:
793 cullTris = 0xffffffff;
794 break;
795 case SWR_CULLMODE_NONE:
796 cullTris = 0x0;
797 break;
798 case SWR_CULLMODE_FRONT:
799 cullTris = frontWindingTris;
800 break;
801 // 0 area triangles are marked as backfacing, which is required behavior for conservative
802 // rast
803 case SWR_CULLMODE_BACK:
804 cullTris = ~frontWindingTris;
805 break;
806 default:
807 SWR_INVALID("Invalid cull mode: %d", rastState.cullMode);
808 cullTris = 0x0;
809 break;
810 }
811
812 triMask &= ~cullTris;
813
814 if (origTriMask ^ triMask)
815 {
816 RDTSC_EVENT(pDC->pContext->pBucketMgr,
817 FECullZeroAreaAndBackface,
818 _mm_popcnt_u32(origTriMask ^ triMask),
819 0);
820 }
821
822 AR_EVENT(CullInfoEvent(pDC->drawId, cullZeroAreaMask, cullTris, origTriMask));
823
    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
825 // compute per tri backface
826 uint32_t frontFaceMask = frontWindingTris;
827 uint32_t* pPrimID = (uint32_t*)&primID;
828 const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
829 uint32_t triIndex = 0;
830
831 uint32_t edgeEnable;
832 PFN_WORK_FUNC pfnWork;
833 if (CT::IsConservativeT::value)
834 {
835 // determine which edges of the degenerate tri, if any, are valid to rasterize.
836 // used to call the appropriate templated rasterizer function
837 if (cullZeroAreaMask > 0)
838 {
839 // e0 = v1-v0
840 const Integer<SIMD_T> x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
841 const Integer<SIMD_T> y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
842
843 uint32_t e0Mask =
844 SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
845
846 // e1 = v2-v1
847 const Integer<SIMD_T> x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
848 const Integer<SIMD_T> y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
849
850 uint32_t e1Mask =
851 SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
852
853 // e2 = v0-v2
854 // if v0 == v1 & v1 == v2, v0 == v2
855 uint32_t e2Mask = e0Mask & e1Mask;
856 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
857
858 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
859 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
860 e0Mask = pdep_u32(e0Mask, 0x00249249);
861
862 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
863 e1Mask = pdep_u32(e1Mask, 0x00492492);
864
865 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
866 e2Mask = pdep_u32(e2Mask, 0x00924924);
867
868 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
869 }
870 else
871 {
872 edgeEnable = 0x00FFFFFF;
873 }
874 }
875 else
876 {
877 // degenerate triangles won't be sent to rasterizer; just enable all edges
878 pfnWork = GetRasterizerFunc(rastState.sampleCount,
879 rastState.bIsCenterPattern,
880 (rastState.conservativeRast > 0),
881 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage,
882 EdgeValToEdgeState(ALL_EDGES_VALID),
883 (state.scissorsTileAligned == false));
884 }
885
886 SIMDBBOX_T<SIMD_T> bbox;
887
888 if (!triMask)
889 {
890 goto endBinTriangles;
891 }
892
893 // Calc bounding box of triangles
894 calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
895
896 // determine if triangle falls between pixel centers and discard
897 // only discard for non-MSAA case and when conservative rast is disabled
898 // (xmin + 127) & ~255
899 // (xmax + 128) & ~255
900 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
901 (!CT::IsConservativeT::value))
902 {
903 origTriMask = triMask;
904
905 int cullCenterMask;
906
907 {
908 Integer<SIMD_T> xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
909 xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
910 Integer<SIMD_T> xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
911 xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
912
913 Integer<SIMD_T> vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
914
915 Integer<SIMD_T> ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
916 ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
917 Integer<SIMD_T> ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
918 ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
919
920 Integer<SIMD_T> vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
921
922 vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
923 cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
924 }
925
926 triMask &= ~cullCenterMask;
927
928 if (origTriMask ^ triMask)
929 {
930 RDTSC_EVENT(pDC->pContext->pBucketMgr,
931 FECullBetweenCenters,
932 _mm_popcnt_u32(origTriMask ^ triMask),
933 0);
934 }
935 }
936
937 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is
938 // exclusive. Gather the AOS effective scissor rects based on the per-prim VP index.
939 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
940 {
941 Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
942 if (pa.viewportArrayActive)
943
944 {
945 GatherScissors(&state.scissorsInFixedPoint[0],
946 pViewportIndex,
947 scisXmin,
948 scisYmin,
949 scisXmax,
950 scisYmax);
951 }
952 else // broadcast fast path for non-VPAI case.
953 {
954 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
955 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
956 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
957 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
958 }
959
960 // Make triangle bbox inclusive
961 bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
962 bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
963
964 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
965 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
966 bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
967 bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
968 }
969
970 if (CT::IsConservativeT::value)
971 {
972 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the
973 // primitive bbox has some area. Bump the xmax/ymax edges out
974
975 Integer<SIMD_T> topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
976 bbox.ymax = SIMD_T::blendv_epi32(
977 bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
978
979 Integer<SIMD_T> leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
980 bbox.xmax = SIMD_T::blendv_epi32(
981 bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
982 }
983
984 // Cull tris completely outside scissor
985 {
986 Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
987 Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
988 Integer<SIMD_T> maskOutsideScissorXY =
989 SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
990 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
991 triMask = triMask & ~maskOutsideScissor;
992 }
993
994 #if KNOB_ENABLE_EARLY_RAST
995 if (rastState.sampleCount == SWR_MULTISAMPLE_1X && !CT::IsConservativeT::value)
996 {
997 // Try early rasterization - culling small triangles which do not cover any pixels
998
999 // convert to ER tiles
1000 SIMDBBOX_T<SIMD_T> er_bbox;
1001
1002 er_bbox.xmin =
1003 SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
1004 er_bbox.xmax =
1005 SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
1006 er_bbox.ymin =
1007 SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
1008 er_bbox.ymax =
1009 SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
1010
1011 Integer<SIMD_T> vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
1012 Integer<SIMD_T> vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
1013
1014 // Take only triangles that fit into ER tile
1015 uint32_t oneTileMask =
1016 triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
1017
1018 if (oneTileMask)
1019 {
1020 // determine CW tris (det > 0)
1021 uint32_t maskCwLo = SIMD_T::movemask_pd(
1022 SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
1023 uint32_t maskCwHi = SIMD_T::movemask_pd(
1024 SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
1025 uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2));
1026
1027 // Try early rasterization
1028 triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(
1029 pDC, er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
1030
1031 if (!triMask)
1032 {
1033 RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1034 return;
1035 }
1036 }
1037 }
1038 #endif
1039
1040 endBinTriangles:
1041
1042
1043 if (!triMask)
1044 {
1045 RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1046 return;
1047 }
1048
1049 // Send surviving triangles to the line or point binner based on fill mode
1050 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
1051 {
1052 // Simple non-conformant wireframe mode, useful for debugging
1053 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
1054 Vec4<SIMD_T> line[2];
1055 Float<SIMD_T> recipW[2];
1056
1057 line[0] = tri[0];
1058 line[1] = tri[1];
1059 recipW[0] = vRecipW0;
1060 recipW[1] = vRecipW1;
1061
1062 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1063 pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
1064
1065 line[0] = tri[1];
1066 line[1] = tri[2];
1067 recipW[0] = vRecipW1;
1068 recipW[1] = vRecipW2;
1069
1070 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1071 pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
1072
1073 line[0] = tri[2];
1074 line[1] = tri[0];
1075 recipW[0] = vRecipW2;
1076 recipW[1] = vRecipW0;
1077
1078 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1079 pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
1080
1081 RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1082 return;
1083 }
1084 else if (rastState.fillMode == SWR_FILLMODE_POINT)
1085 {
1086 // Bin 3 points
1087 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1088 pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
1089 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1090 pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
1091 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1092 pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
1093
1094 RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1095 return;
1096 }
1097
1098 // Convert triangle bbox to macrotile units.
1099 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1100 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1101 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1102 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1103
1104 OSALIGNSIMD16(uint32_t)
1105 aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1106
1107 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
1108 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
1109 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
1110 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
1111
1112 // transpose verts needed for backend
1113 /// @todo modify BE to take non-transformed verts
1114 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1115 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1116 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1117 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1118
1119 TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1120 TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1121 TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1122 TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1123
1124 // scan remaining valid triangles and bin each separately
1125 while (_BitScanForward((unsigned long*)&triIndex, triMask))
1126 {
1127 uint32_t linkageCount = state.backendState.numAttributes;
1128 uint32_t numScalarAttribs = linkageCount * 4;
1129
1130 BE_WORK work;
1131 work.type = DRAW;
1132
1133 bool isDegenerate;
1134 if (CT::IsConservativeT::value)
1135 {
1136 // only rasterize valid edges if we have a degenerate primitive
1137 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1138 work.pfnWork =
1139 GetRasterizerFunc(rastState.sampleCount,
1140 rastState.bIsCenterPattern,
1141 (rastState.conservativeRast > 0),
1142 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage,
1143 EdgeValToEdgeState(triEdgeEnable),
1144 (state.scissorsTileAligned == false));
1145
1146 // Degenerate triangles are required to be constant interpolated
1147 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1148 }
1149 else
1150 {
1151 isDegenerate = false;
1152 work.pfnWork = pfnWork;
1153 }
1154
1155 // Select attribute processor
1156 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs =
1157 GetProcessAttributesFunc(3,
1158 state.backendState.swizzleEnable,
1159 state.backendState.constantInterpolationMask,
1160 isDegenerate);
1161
1162 TRIANGLE_WORK_DESC& desc = work.desc.tri;
1163
1164 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1165 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1166 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1167
1168 auto pArena = pDC->pArena;
1169 SWR_ASSERT(pArena != nullptr);
1170
1171 // store active attribs
1172 float* pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1173 desc.pAttribs = pAttribs;
1174 desc.numAttribs = linkageCount;
1175 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1176
1177 // store triangle vertex data
1178 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1179
1180 SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
1181 SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
1182 SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
1183 SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1184
1185 // store user clip distances
1186 if (state.backendState.clipDistanceMask)
1187 {
1188 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1189 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1190 ProcessUserClipDist<3>(
1191 state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1192 }
1193
1194 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1195 {
1196 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1197 {
1198 #if KNOB_ENABLE_TOSS_POINTS
1199 if (!KNOB_TOSS_SETUP_TRIS)
1200 #endif
1201 {
1202 pTileMgr->enqueue(x, y, &work);
1203 }
1204 }
1205 }
1206
1207 triMask &= ~(1 << triIndex);
1208 }
1209
1210 RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1211 }
1212
1213 template <typename CT>
BinTriangles(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector tri[3],uint32_t triMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1214 void BinTriangles(DRAW_CONTEXT* pDC,
1215 PA_STATE& pa,
1216 uint32_t workerId,
1217 simdvector tri[3],
1218 uint32_t triMask,
1219 simdscalari const& primID,
1220 simdscalari const& viewportIdx,
1221 simdscalari const& rtIdx)
1222 {
1223 BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(
1224 pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1225 }
1226
1227 #if USE_SIMD16_FRONTEND
1228 template <typename CT>
BinTriangles_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector tri[3],uint32_t triMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1229 void SIMDCALL BinTriangles_simd16(DRAW_CONTEXT* pDC,
1230 PA_STATE& pa,
1231 uint32_t workerId,
1232 simd16vector tri[3],
1233 uint32_t triMask,
1234 simd16scalari const& primID,
1235 simd16scalari const& viewportIdx,
1236 simd16scalari const& rtIdx)
1237 {
1238 BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(
1239 pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1240 }
1241
1242 #endif
1243 struct FEBinTrianglesChooser
1244 {
1245 typedef PFN_PROCESS_PRIMS FuncType;
1246
1247 template <typename... ArgsB>
GetFuncFEBinTrianglesChooser1248 static FuncType GetFunc()
1249 {
1250 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1251 }
1252 };
1253
1254 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc(bool IsConservative)1255 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1256 {
1257 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1258 }
1259
1260 #if USE_SIMD16_FRONTEND
1261 struct FEBinTrianglesChooser_simd16
1262 {
1263 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1264
1265 template <typename... ArgsB>
GetFuncFEBinTrianglesChooser_simd161266 static FuncType GetFunc()
1267 {
1268 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1269 }
1270 };
1271
1272 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc_simd16(bool IsConservative)1273 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1274 {
1275 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1276 }
1277
1278 #endif
1279
1280 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPostSetupPointsImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> prim[],uint32_t primMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)1281 void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC,
1282 PA_STATE& pa,
1283 uint32_t workerId,
1284 Vec4<SIMD_T> prim[],
1285 uint32_t primMask,
1286 Integer<SIMD_T> const& primID,
1287 Integer<SIMD_T> const& viewportIdx,
1288 Integer<SIMD_T> const& rtIdx)
1289 {
1290 RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinPoints, pDC->drawId);
1291
1292 Vec4<SIMD_T>& primVerts = prim[0];
1293
1294 const API_STATE& state = GetApiState(pDC);
1295 const SWR_RASTSTATE& rastState = state.rastState;
1296 const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
1297
1298 // Select attribute processor
1299 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(
1300 1, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1301
1302 // convert to fixed point
1303 Integer<SIMD_T> vXi, vYi;
1304
1305 vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
1306 vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
1307
1308 if (CanUseSimplePoints(pDC))
1309 {
1310 // adjust for ymin-xmin rule
1311 vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
1312 vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
1313
1314 // cull points off the ymin-xmin edge of the viewport
1315 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
1316 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1317
1318 // compute macro tile coordinates
1319 Integer<SIMD_T> macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1320 Integer<SIMD_T> macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1321
1322 OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1323
1324 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroX), macroX);
1325 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroY), macroY);
1326
1327 // compute raster tile coordinates
1328 Integer<SIMD_T> rasterX =
1329 SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1330 Integer<SIMD_T> rasterY =
1331 SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1332
1333 // compute raster tile relative x,y for coverage mask
1334 Integer<SIMD_T> tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1335 Integer<SIMD_T> tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1336
1337 Integer<SIMD_T> tileRelativeX =
1338 SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1339 Integer<SIMD_T> tileRelativeY =
1340 SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1341
1342 OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1343 OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1344
1345 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeX), tileRelativeX);
1346 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeY), tileRelativeY);
1347
1348 OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1349 OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1350
1351 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedX), tileAlignedX);
1352 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedY), tileAlignedY);
1353
1354 OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1355 SIMD_T::store_ps(reinterpret_cast<float*>(aZ), primVerts.z);
1356
1357 // store render target array index
1358 const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
1359
1360 uint32_t* pPrimID = (uint32_t*)&primID;
1361 uint32_t primIndex = 0;
1362
1363 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1364
1365 // scan remaining valid triangles and bin each separately
1366 while (_BitScanForward((unsigned long*)&primIndex, primMask))
1367 {
1368 uint32_t linkageCount = backendState.numAttributes;
1369 uint32_t numScalarAttribs = linkageCount * 4;
1370
1371 BE_WORK work;
1372 work.type = DRAW;
1373
1374 TRIANGLE_WORK_DESC& desc = work.desc.tri;
1375
1376 // points are always front facing
1377 desc.triFlags.frontFacing = 1;
1378 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1379 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1380
1381 work.pfnWork = RasterizeSimplePoint;
1382
1383 auto pArena = pDC->pArena;
1384 SWR_ASSERT(pArena != nullptr);
1385
1386 // store attributes
1387 float* pAttribs =
1388 (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1389 desc.pAttribs = pAttribs;
1390 desc.numAttribs = linkageCount;
1391
1392 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1393
1394 // store raster tile aligned x, y, perspective correct z
1395 float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1396 desc.pTriBuffer = pTriBuffer;
1397 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1398 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1399 *pTriBuffer = aZ[primIndex];
1400
1401 uint32_t tX = aTileRelativeX[primIndex];
1402 uint32_t tY = aTileRelativeY[primIndex];
1403
1404 // pack the relative x,y into the coverageMask, the rasterizer will
1405 // generate the true coverage mask from it
1406 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1407
1408 // bin it
1409 MacroTileMgr* pTileMgr = pDC->pTileMgr;
1410 #if KNOB_ENABLE_TOSS_POINTS
1411 if (!KNOB_TOSS_SETUP_TRIS)
1412 #endif
1413 {
1414 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1415 }
1416
1417 primMask &= ~(1 << primIndex);
1418 }
1419 }
1420 else
1421 {
1422 // non simple points need to be potentially binned to multiple macro tiles
1423 Float<SIMD_T> vPointSize;
1424
1425 if (rastState.pointParam)
1426 {
1427 Vec4<SIMD_T> size[3];
1428 pa.Assemble(VERTEX_SGV_SLOT, size);
1429 vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1430 }
1431 else
1432 {
1433 vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1434 }
1435
1436 // bloat point to bbox
1437 SIMDBBOX_T<SIMD_T> bbox;
1438
1439 bbox.xmin = bbox.xmax = vXi;
1440 bbox.ymin = bbox.ymax = vYi;
1441
1442 Float<SIMD_T> vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1443 Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1444
1445 bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1446 bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1447 bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1448 bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1449
1450 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge
1451 // is exclusive. Gather the AOS effective scissor rects based on the per-prim VP index.
1452 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1453 {
1454 Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
1455
1456 if (pa.viewportArrayActive)
1457 {
1458 GatherScissors(&state.scissorsInFixedPoint[0],
1459 pViewportIndex,
1460 scisXmin,
1461 scisYmin,
1462 scisXmax,
1463 scisYmax);
1464 }
1465 else // broadcast fast path for non-VPAI case.
1466 {
1467 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1468 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1469 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1470 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1471 }
1472
1473 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1474 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1475 bbox.xmax =
1476 SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1477 bbox.ymax =
1478 SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1479 }
1480
1481 // Cull bloated points completely outside scissor
1482 Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1483 Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1484 Integer<SIMD_T> maskOutsideScissorXY =
1485 SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1486 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1487 primMask = primMask & ~maskOutsideScissor;
1488
1489 // Convert bbox to macrotile units.
1490 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1491 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1492 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1493 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1494
1495 OSALIGNSIMD16(uint32_t)
1496 aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1497
1498 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
1499 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
1500 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
1501 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
1502
1503 // store render target array index
1504 const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
1505
1506 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1507 SIMD_T::store_ps(reinterpret_cast<float*>(aPointSize), vPointSize);
1508
1509 uint32_t* pPrimID = (uint32_t*)&primID;
1510
1511 OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1512 OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1513 OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1514
1515 SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsX), primVerts.x);
1516 SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsY), primVerts.y);
1517 SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsZ), primVerts.z);
1518
1519 // scan remaining valid prims and bin each separately
1520 const SWR_BACKEND_STATE& backendState = state.backendState;
1521 uint32_t primIndex;
1522 while (_BitScanForward((unsigned long*)&primIndex, primMask))
1523 {
1524 uint32_t linkageCount = backendState.numAttributes;
1525 uint32_t numScalarAttribs = linkageCount * 4;
1526
1527 BE_WORK work;
1528 work.type = DRAW;
1529
1530 TRIANGLE_WORK_DESC& desc = work.desc.tri;
1531
1532 desc.triFlags.frontFacing = 1;
1533 desc.triFlags.pointSize = aPointSize[primIndex];
1534 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1535 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1536
1537 work.pfnWork = RasterizeTriPoint;
1538
1539 auto pArena = pDC->pArena;
1540 SWR_ASSERT(pArena != nullptr);
1541
1542 // store active attribs
1543 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1544 desc.numAttribs = linkageCount;
1545 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1546
1547 // store point vertex data
1548 float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1549 desc.pTriBuffer = pTriBuffer;
1550 *pTriBuffer++ = aPrimVertsX[primIndex];
1551 *pTriBuffer++ = aPrimVertsY[primIndex];
1552 *pTriBuffer = aPrimVertsZ[primIndex];
1553
1554 // store user clip distances
1555 if (backendState.clipDistanceMask)
1556 {
1557 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1558 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1559 float dists[8];
1560 float one = 1.0f;
1561 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1562 for (uint32_t i = 0; i < numClipDist; i++)
1563 {
1564 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1565 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1566 desc.pUserClipBuffer[3 * i + 2] = dists[i];
1567 }
1568 }
1569
1570 MacroTileMgr* pTileMgr = pDC->pTileMgr;
1571 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1572 {
1573 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1574 {
1575 #if KNOB_ENABLE_TOSS_POINTS
1576 if (!KNOB_TOSS_SETUP_TRIS)
1577 #endif
1578 {
1579 pTileMgr->enqueue(x, y, &work);
1580 }
1581 }
1582 }
1583
1584 primMask &= ~(1 << primIndex);
1585 }
1586 }
1587
1588 RDTSC_END(pDC->pContext->pBucketMgr, FEBinPoints, 1);
1589 }
1590
1591 //////////////////////////////////////////////////////////////////////////
1592 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1593 /// @param pDC - pointer to draw context.
1594 /// @param pa - The primitive assembly object.
1595 /// @param workerId - thread's worker id. Even thread has a unique id.
1596 /// @param tri - Contains point position data for SIMDs worth of points.
1597 /// @param primID - Primitive ID for each point.
1598 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPointsImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> prim[3],uint32_t primMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)1599 void BinPointsImpl(DRAW_CONTEXT* pDC,
1600 PA_STATE& pa,
1601 uint32_t workerId,
1602 Vec4<SIMD_T> prim[3],
1603 uint32_t primMask,
1604 Integer<SIMD_T> const& primID,
1605 Integer<SIMD_T> const& viewportIdx,
1606 Integer<SIMD_T> const& rtIdx)
1607 {
1608 const API_STATE& state = GetApiState(pDC);
1609 const SWR_FRONTEND_STATE& feState = state.frontendState;
1610 const SWR_RASTSTATE& rastState = state.rastState;
1611
1612 if (!feState.vpTransformDisable)
1613 {
1614 // perspective divide
1615 Float<SIMD_T> vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1616
1617 prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1618 prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1619 prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1620
1621 // viewport transform to screen coords
1622 if (pa.viewportArrayActive)
1623 {
1624 viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1625 }
1626 else
1627 {
1628 viewportTransform<1>(prim, state.vpMatrices);
1629 }
1630 }
1631
1632 Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1633
1634 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1635 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1636
1637 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1638 pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1639 }
1640
BinPoints(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[3],uint32_t primMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1641 void BinPoints(DRAW_CONTEXT* pDC,
1642 PA_STATE& pa,
1643 uint32_t workerId,
1644 simdvector prim[3],
1645 uint32_t primMask,
1646 simdscalari const& primID,
1647 simdscalari const& viewportIdx,
1648 simdscalari const& rtIdx)
1649 {
1650 BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1651 pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1652 }
1653
1654 #if USE_SIMD16_FRONTEND
BinPoints_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector prim[3],uint32_t primMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1655 void SIMDCALL BinPoints_simd16(DRAW_CONTEXT* pDC,
1656 PA_STATE& pa,
1657 uint32_t workerId,
1658 simd16vector prim[3],
1659 uint32_t primMask,
1660 simd16scalari const& primID,
1661 simd16scalari const& viewportIdx,
1662 simd16scalari const& rtIdx)
1663 {
1664 BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1665 pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1666 }
1667
1668 #endif
1669 //////////////////////////////////////////////////////////////////////////
1670 /// @brief Bin SIMD lines to the backend.
1671 /// @param pDC - pointer to draw context.
1672 /// @param pa - The primitive assembly object.
1673 /// @param workerId - thread's worker id. Every thread has a unique id.
1674 /// @param prim - Contains line position data for a SIMD's worth of lines.
1675 /// @param primID - Primitive ID for each line.
1676 /// @param viewportIdx - Viewport Array Index for each line.
1677 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPostSetupLinesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> prim[],Float<SIMD_T> recipW[],uint32_t primMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)1678 void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC,
1679 PA_STATE& pa,
1680 uint32_t workerId,
1681 Vec4<SIMD_T> prim[],
1682 Float<SIMD_T> recipW[],
1683 uint32_t primMask,
1684 Integer<SIMD_T> const& primID,
1685 Integer<SIMD_T> const& viewportIdx,
1686 Integer<SIMD_T> const& rtIdx)
1687 {
1688 const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
1689
1690 RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinLines, pDC->drawId);
1691
1692 const API_STATE& state = GetApiState(pDC);
1693 const SWR_RASTSTATE& rastState = state.rastState;
1694
1695 // Select attribute processor
1696 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(
1697 2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1698
1699 Float<SIMD_T>& vRecipW0 = recipW[0];
1700 Float<SIMD_T>& vRecipW1 = recipW[1];
1701
1702 // convert to fixed point
1703 Integer<SIMD_T> vXi[2], vYi[2];
1704
1705 vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1706 vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1707 vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1708 vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1709
1710 // compute x-major vs y-major mask
1711 Integer<SIMD_T> xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1712 Integer<SIMD_T> yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1713 Float<SIMD_T> vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1714 uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1715
1716 // cull zero-length lines
1717 Integer<SIMD_T> vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1718 vZeroLengthMask =
1719 SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1720
1721 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1722
1723 uint32_t* pPrimID = (uint32_t*)&primID;
1724 const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
1725
1726 // Calc bounding box of lines
1727 SIMDBBOX_T<SIMD_T> bbox;
1728 bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1729 bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1730 bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1731 bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1732
1733 // bloat bbox by line width along minor axis
1734 Float<SIMD_T> vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1735 Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1736
1737 SIMDBBOX_T<SIMD_T> bloatBox;
1738
1739 bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1740 bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1741 bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1742 bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1743
1744 bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1745 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1746 bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1747 bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1748
1749 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is
1750 // exclusive.
1751 {
1752 Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
1753
1754 if (pa.viewportArrayActive)
1755 {
1756 GatherScissors(&state.scissorsInFixedPoint[0],
1757 pViewportIndex,
1758 scisXmin,
1759 scisYmin,
1760 scisXmax,
1761 scisYmax);
1762 }
1763 else // broadcast fast path for non-VPAI case.
1764 {
1765 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1766 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1767 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1768 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1769 }
1770
1771 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1772 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1773 bbox.xmax =
1774 SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1775 bbox.ymax =
1776 SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1777 }
1778
1779 // Cull prims completely outside scissor
1780 {
1781 Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1782 Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1783 Integer<SIMD_T> maskOutsideScissorXY =
1784 SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1785 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1786 primMask = primMask & ~maskOutsideScissor;
1787 }
1788
1789 // transpose verts needed for backend
1790 /// @todo modify BE to take non-transformed verts
1791 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1792 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1793 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1794 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1795
1796 if (!primMask)
1797 {
1798 goto endBinLines;
1799 }
1800
1801 // Convert triangle bbox to macrotile units.
1802 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1803 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1804 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1805 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1806
1807 OSALIGNSIMD16(uint32_t)
1808 aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1809
1810 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
1811 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
1812 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
1813 SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
1814
1815 TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1816 TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1817 TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1818 TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
1819
1820 // scan remaining valid prims and bin each separately
1821 unsigned long primIndex;
1822 while (_BitScanForward(&primIndex, primMask))
1823 {
1824 uint32_t linkageCount = state.backendState.numAttributes;
1825 uint32_t numScalarAttribs = linkageCount * 4;
1826
1827 BE_WORK work;
1828 work.type = DRAW;
1829
1830 TRIANGLE_WORK_DESC& desc = work.desc.tri;
1831
1832 desc.triFlags.frontFacing = 1;
1833 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1834 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1835 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1836
1837 work.pfnWork = RasterizeLine;
1838
1839 auto pArena = pDC->pArena;
1840 SWR_ASSERT(pArena != nullptr);
1841
1842 // store active attribs
1843 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1844 desc.numAttribs = linkageCount;
1845 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1846
1847 // store line vertex data
1848 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1849
1850 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1851 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1852 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1853 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1854
1855 // store user clip distances
1856 if (state.backendState.clipDistanceMask)
1857 {
1858 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1859 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1860 ProcessUserClipDist<2>(
1861 state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1862 }
1863
1864 MacroTileMgr* pTileMgr = pDC->pTileMgr;
1865 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1866 {
1867 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1868 {
1869 #if KNOB_ENABLE_TOSS_POINTS
1870 if (!KNOB_TOSS_SETUP_TRIS)
1871 #endif
1872 {
1873 pTileMgr->enqueue(x, y, &work);
1874 }
1875 }
1876 }
1877
1878 primMask &= ~(1 << primIndex);
1879 }
1880
1881 endBinLines:
1882
1883 RDTSC_END(pDC->pContext->pBucketMgr, FEBinLines, 1);
1884 }
1885
1886 //////////////////////////////////////////////////////////////////////////
1887 /// @brief Bin SIMD lines to the backend.
1888 /// @param pDC - pointer to draw context.
1889 /// @param pa - The primitive assembly object.
1890 /// @param workerId - thread's worker id. Even thread has a unique id.
1891 /// @param tri - Contains line position data for SIMDs worth of points.
1892 /// @param primID - Primitive ID for each line.
1893 /// @param viewportIdx - Viewport Array Index for each line.
1894 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinLinesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> prim[3],uint32_t primMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)1895 void SIMDCALL BinLinesImpl(DRAW_CONTEXT* pDC,
1896 PA_STATE& pa,
1897 uint32_t workerId,
1898 Vec4<SIMD_T> prim[3],
1899 uint32_t primMask,
1900 Integer<SIMD_T> const& primID,
1901 Integer<SIMD_T> const& viewportIdx,
1902 Integer<SIMD_T> const& rtIdx)
1903 {
1904 const API_STATE& state = GetApiState(pDC);
1905 const SWR_RASTSTATE& rastState = state.rastState;
1906 const SWR_FRONTEND_STATE& feState = state.frontendState;
1907
1908 Float<SIMD_T> vRecipW[2] = {SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f)};
1909
1910 if (!feState.vpTransformDisable)
1911 {
1912 // perspective divide
1913 vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1914 vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1915
1916 prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1917 prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1918
1919 prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1920 prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1921
1922 prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1923 prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1924
1925 // viewport transform to screen coords
1926 if (pa.viewportArrayActive)
1927 {
1928 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1929 }
1930 else
1931 {
1932 viewportTransform<2>(prim, state.vpMatrices);
1933 }
1934 }
1935
1936 // adjust for pixel center location
1937 Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1938
1939 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1940 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1941
1942 prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1943 prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1944
1945 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1946 pDC, pa, workerId, prim, vRecipW, primMask, primID, viewportIdx, rtIdx);
1947 }
1948
BinLines(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[],uint32_t primMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1949 void BinLines(DRAW_CONTEXT* pDC,
1950 PA_STATE& pa,
1951 uint32_t workerId,
1952 simdvector prim[],
1953 uint32_t primMask,
1954 simdscalari const& primID,
1955 simdscalari const& viewportIdx,
1956 simdscalari const& rtIdx)
1957 {
1958 BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(
1959 pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1960 }
1961
#if USE_SIMD16_FRONTEND
/// @brief SIMD16 entry point for line binning. Forwards every argument
///        unchanged to BinLinesImpl instantiated for SIMD512 / KNOB_SIMD16_WIDTH.
///        Only compiled when USE_SIMD16_FRONTEND is enabled.
void SIMDCALL BinLines_simd16(DRAW_CONTEXT*        pDC,
                              PA_STATE&            pa,
                              uint32_t             workerId,
                              simd16vector         prim[3],
                              uint32_t             primMask,
                              simd16scalari const& primID,
                              simd16scalari const& viewportIdx,
                              simd16scalari const& rtIdx)
{
    BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC,
                                             pa,
                                             workerId,
                                             prim,
                                             primMask,
                                             primID,
                                             viewportIdx,
                                             rtIdx);
}

#endif
1977