1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
#include "binner.h"

#include <cstring>

#include "context.h"
#include "frontend.h"
#include "conservativeRast.h"
#include "pa.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"
37
// Function Prototype
// Forward declarations of the post-setup line and point binners. Their
// definitions are not part of this section of the file.
// NOTE(review): the parameter meanings below are inferred from the names
// only - confirm against the definitions.

/// @brief Post-setup binner for a SIMD batch of line primitives.
/// @param pDC - draw context
/// @param pa - primitive assembly state
/// @param workerId - id of the calling worker thread
/// @param prim - per-vertex position data for the batch
/// @param recipW - per-vertex reciprocal W values (presumably 1/w)
/// @param primMask - mask of valid SIMD lanes (primitives)
/// @param primID - primitive ID for each lane
/// @param viewportIdx - viewport array index for each lane
/// @param rtIdx - presumably the render target array index for each lane
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupLinesImpl(
    DRAW_CONTEXT *pDC,
    PA_STATE &pa,
    uint32_t workerId,
    typename SIMD_T::Vec4 prim[],
    typename SIMD_T::Float recipW[],
    uint32_t primMask,
    typename SIMD_T::Integer const &primID,
    typename SIMD_T::Integer const &viewportIdx,
    typename SIMD_T::Integer const &rtIdx);

/// @brief Post-setup binner for a SIMD batch of point primitives.
/// @param pDC - draw context
/// @param pa - primitive assembly state
/// @param workerId - id of the calling worker thread
/// @param prim - per-vertex position data for the batch
/// @param primMask - mask of valid SIMD lanes (primitives)
/// @param primID - primitive ID for each lane
/// @param viewportIdx - viewport array index for each lane
/// @param rtIdx - presumably the render target array index for each lane
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupPointsImpl(
    DRAW_CONTEXT *pDC,
    PA_STATE &pa,
    uint32_t workerId,
    typename SIMD_T::Vec4 prim[],
    uint32_t primMask,
    typename SIMD_T::Integer const &primID,
    typename SIMD_T::Integer const &viewportIdx,
    typename SIMD_T::Integer const &rtIdx);
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief Processes attributes for the backend based on linkage mask and
64 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
65 /// @param pDC - Draw context
66 /// @param pa - Primitive Assembly state
67 /// @param linkageMask - Specifies which VS outputs are routed to PS.
68 /// @param pLinkageMap - maps VS attribute slot to PS slot
69 /// @param triIndex - Triangle to process attributes for
70 /// @param pBuffer - Output result
71 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
ProcessAttributes(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t triIndex,uint32_t primId,float * pBuffer)72 INLINE void ProcessAttributes(
73 DRAW_CONTEXT *pDC,
74 PA_STATE&pa,
75 uint32_t triIndex,
76 uint32_t primId,
77 float *pBuffer)
78 {
79 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
80 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
81 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
82 uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
83 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
84 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
85
86 static const float constTable[3][4] = {
87 { 0.0f, 0.0f, 0.0f, 0.0f },
88 { 0.0f, 0.0f, 0.0f, 1.0f },
89 { 1.0f, 1.0f, 1.0f, 1.0f }
90 };
91
92 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
93 {
94 uint32_t inputSlot;
95 if (IsSwizzledT::value)
96 {
97 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
98 inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
99
100 }
101 else
102 {
103 inputSlot = backendState.vertexAttribOffset + i;
104 }
105
106 simd4scalar attrib[3]; // triangle attribs (always 4 wide)
107 float* pAttribStart = pBuffer;
108
109 if (HasConstantInterpT::value || IsDegenerate::value)
110 {
111 if (CheckBit(constantInterpMask, i))
112 {
113 uint32_t vid;
114 uint32_t adjustedTriIndex;
115 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
116 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
117 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
118 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
119 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
120
121 switch (topo) {
122 case TOP_QUAD_LIST:
123 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
124 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
125 break;
126 case TOP_QUAD_STRIP:
127 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
128 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
129 break;
130 case TOP_TRIANGLE_STRIP:
131 adjustedTriIndex = triIndex;
132 vid = (triIndex & 1)
133 ? tristripProvokingVertex[provokingVertex]
134 : provokingVertex;
135 break;
136 default:
137 adjustedTriIndex = triIndex;
138 vid = provokingVertex;
139 break;
140 }
141
142 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
143
144 for (uint32_t i = 0; i < NumVertsT::value; ++i)
145 {
146 SIMD128::store_ps(pBuffer, attrib[vid]);
147 pBuffer += 4;
148 }
149 }
150 else
151 {
152 pa.AssembleSingle(inputSlot, triIndex, attrib);
153
154 for (uint32_t i = 0; i < NumVertsT::value; ++i)
155 {
156 SIMD128::store_ps(pBuffer, attrib[i]);
157 pBuffer += 4;
158 }
159 }
160 }
161 else
162 {
163 pa.AssembleSingle(inputSlot, triIndex, attrib);
164
165 for (uint32_t i = 0; i < NumVertsT::value; ++i)
166 {
167 SIMD128::store_ps(pBuffer, attrib[i]);
168 pBuffer += 4;
169 }
170 }
171
172 // pad out the attrib buffer to 3 verts to ensure the triangle
173 // interpolation code in the pixel shader works correctly for the
174 // 3 topologies - point, line, tri. This effectively zeros out the
175 // effect of the missing vertices in the triangle interpolation.
176 for (uint32_t v = NumVertsT::value; v < 3; ++v)
177 {
178 SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
179 pBuffer += 4;
180 }
181
182 // check for constant source overrides
183 if (IsSwizzledT::value)
184 {
185 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
186 if (mask)
187 {
188 DWORD comp;
189 while (_BitScanForward(&comp, mask))
190 {
191 mask &= ~(1 << comp);
192
193 float constantValue = 0.0f;
194 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
195 {
196 case SWR_CONSTANT_SOURCE_CONST_0000:
197 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
198 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
199 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
200 break;
201 case SWR_CONSTANT_SOURCE_PRIM_ID:
202 constantValue = *(float*)&primId;
203 break;
204 }
205
206 // apply constant value to all 3 vertices
207 for (uint32_t v = 0; v < 3; ++v)
208 {
209 pAttribStart[comp + v * 4] = constantValue;
210 }
211 }
212 }
213 }
214 }
215 }
216
217 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
218
219 struct ProcessAttributesChooser
220 {
221 typedef PFN_PROCESS_ATTRIBUTES FuncType;
222
223 template <typename... ArgsB>
GetFuncProcessAttributesChooser224 static FuncType GetFunc()
225 {
226 return ProcessAttributes<ArgsB...>;
227 }
228 };
229
GetProcessAttributesFunc(uint32_t NumVerts,bool IsSwizzled,bool HasConstantInterp,bool IsDegenerate=false)230 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
231 {
232 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
233 }
234
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Processes enabled user clip distances. Loads the active clip
237 /// distances from the PA, sets up barycentric equations, and
238 /// stores the results to the output buffer
239 /// @param pa - Primitive Assembly state
240 /// @param primIndex - primitive index to process
241 /// @param clipDistMask - mask of enabled clip distances
242 /// @param pUserClipBuffer - buffer to store results
243 template<uint32_t NumVerts>
ProcessUserClipDist(const SWR_BACKEND_STATE & state,PA_STATE & pa,uint32_t primIndex,float * pRecipW,float * pUserClipBuffer)244 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
245 {
246 DWORD clipDist;
247 uint32_t clipDistMask = state.clipDistanceMask;
248 while (_BitScanForward(&clipDist, clipDistMask))
249 {
250 clipDistMask &= ~(1 << clipDist);
251 uint32_t clipSlot = clipDist >> 2;
252 uint32_t clipComp = clipDist & 0x3;
253 uint32_t clipAttribSlot = clipSlot == 0 ?
254 state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
255
256 simd4scalar primClipDist[3];
257 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
258
259 float vertClipDist[NumVerts];
260 for (uint32_t e = 0; e < NumVerts; ++e)
261 {
262 OSALIGNSIMD(float) aVertClipDist[4];
263 SIMD128::store_ps(aVertClipDist, primClipDist[e]);
264 vertClipDist[e] = aVertClipDist[clipComp];
265 };
266
267 // setup plane equations for barycentric interpolation in the backend
268 float baryCoeff[NumVerts];
269 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
270 for (uint32_t e = 0; e < NumVerts - 1; ++e)
271 {
272 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
273 }
274 baryCoeff[NumVerts - 1] = last;
275
276 for (uint32_t e = 0; e < NumVerts; ++e)
277 {
278 *(pUserClipBuffer++) = baryCoeff[e];
279 }
280 }
281 }
282
283 INLINE
TransposeVertices(simd4scalar (& dst)[8],const simdscalar & src0,const simdscalar & src1,const simdscalar & src2)284 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
285 {
286 vTranspose3x8(dst, src0, src1, src2);
287 }
288
289 INLINE
TransposeVertices(simd4scalar (& dst)[16],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2)290 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
291 {
292 vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
293 }
294
295
296 #if KNOB_ENABLE_EARLY_RAST
297
298 #define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT)
299 #define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT)
300
301
302 template<typename SIMD_T>
303 struct EarlyRastHelper
304 {
305 };
306
307 template<>
308 struct EarlyRastHelper<SIMD256>
309 {
InitShiftCntrlEarlyRastHelper310 static SIMD256::Integer InitShiftCntrl()
311 {
312 return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
313 }
314 };
315
316 #if USE_SIMD16_FRONTEND
317 template<>
318 struct EarlyRastHelper<SIMD512>
319 {
InitShiftCntrlEarlyRastHelper320 static SIMD512::Integer InitShiftCntrl()
321 {
322 return SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
323 }
324 };
325
326 #endif
327 //////////////////////////////////////////////////////////////////////////
328 /// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
329 /// (ER tile) can be rasterized as early as in binner to check if
330 /// they cover any pixels. If not - the triangles can be
331 /// culled in binner.
332 ///
333 /// @param er_bbox - coordinates of ER tile for each triangle
334 /// @param vAi - A coefficients of triangle edges
335 /// @param vBi - B coefficients of triangle edges
336 /// @param vXi - X coordinates of triangle vertices
337 /// @param vYi - Y coordinates of triangle vertices
338 /// @param frontWindingTris - mask indicating CCW/CW triangles
339 /// @param triMask - mask for valid SIMD lanes (triangles)
340 /// @param oneTileMask - defines triangles for ER to work on
341 /// (tris that fit into ER tile)
342 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
EarlyRasterizer(SIMDBBOX_T<SIMD_T> & er_bbox,typename SIMD_T::Integer (& vAi)[3],typename SIMD_T::Integer (& vBi)[3],typename SIMD_T::Integer (& vXi)[3],typename SIMD_T::Integer (& vYi)[3],uint32_t cwTrisMask,uint32_t triMask,uint32_t oneTileMask)343 uint32_t SIMDCALL EarlyRasterizer(
344 SIMDBBOX_T<SIMD_T> &er_bbox,
345 typename SIMD_T::Integer (&vAi)[3],
346 typename SIMD_T::Integer (&vBi)[3],
347 typename SIMD_T::Integer (&vXi)[3],
348 typename SIMD_T::Integer (&vYi)[3],
349 uint32_t cwTrisMask,
350 uint32_t triMask,
351 uint32_t oneTileMask)
352 {
353 // step to pixel center of top-left pixel of the triangle bbox
354 typename SIMD_T::Integer vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
355 vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
356
357 typename SIMD_T::Integer vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
358 vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
359
360 // negate A and B for CW tris
361 typename SIMD_T::Integer vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
362 typename SIMD_T::Integer vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
363 typename SIMD_T::Integer vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
364 typename SIMD_T::Integer vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
365 typename SIMD_T::Integer vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
366 typename SIMD_T::Integer vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
367
368 RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);
369
370 typename SIMD_T::Integer vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
371 typename SIMD_T::Integer vCwTris = SIMD_T::set1_epi32(cwTrisMask);
372 typename SIMD_T::Integer vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
373
374 vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
375 vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
376 vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
377 vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
378 vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
379 vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
380
381 // evaluate edge equations at top-left pixel
382 typename SIMD_T::Integer vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
383 typename SIMD_T::Integer vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
384 typename SIMD_T::Integer vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
385
386 typename SIMD_T::Integer vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
387 typename SIMD_T::Integer vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
388 typename SIMD_T::Integer vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
389
390 typename SIMD_T::Integer vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
391 typename SIMD_T::Integer vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
392 typename SIMD_T::Integer vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
393
394 typename SIMD_T::Integer vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
395 typename SIMD_T::Integer vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
396 typename SIMD_T::Integer vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
397
398 typename SIMD_T::Integer vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
399 typename SIMD_T::Integer vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
400 typename SIMD_T::Integer vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
401
402 vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
403 vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
404 vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
405
406 // top left rule
407 typename SIMD_T::Integer vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
408 typename SIMD_T::Integer vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
409 typename SIMD_T::Integer vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
410
411 // vA < 0
412 vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
413 vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
414 vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
415
416 // vA == 0 && vB < 0
417 typename SIMD_T::Integer vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
418 typename SIMD_T::Integer vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
419 typename SIMD_T::Integer vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
420
421 vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
422 vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
423 vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);
424
425 vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
426 vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
427 vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
428
429
430 #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
431 // Go down
432 // coverage pixel 0
433 typename SIMD_T::Integer vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
434 vMask0 = SIMD_T::and_si(vMask0, vEdge2);
435
436 // coverage pixel 1
437 typename SIMD_T::Integer vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
438 typename SIMD_T::Integer vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
439 typename SIMD_T::Integer vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
440 typename SIMD_T::Integer vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
441 vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
442
443 // coverage pixel 2
444 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
445 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
446 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
447 typename SIMD_T::Integer vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
448 vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
449
450 // coverage pixel 3
451 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
452 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
453 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
454 typename SIMD_T::Integer vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
455 vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
456
457 // One step to the right and then up
458
459 // coverage pixel 4
460 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
461 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
462 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
463 typename SIMD_T::Integer vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
464 vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
465
466 // coverage pixel 5
467 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
468 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
469 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
470 typename SIMD_T::Integer vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
471 vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
472
473 // coverage pixel 6
474 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
475 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
476 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
477 typename SIMD_T::Integer vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
478 vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
479
480 // coverage pixel 7
481 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
482 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
483 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
484 typename SIMD_T::Integer vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
485 vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
486
487 typename SIMD_T::Integer vLit1 = SIMD_T::or_si(vMask0, vMask1);
488 vLit1 = SIMD_T::or_si(vLit1, vMask2);
489 vLit1 = SIMD_T::or_si(vLit1, vMask3);
490 vLit1 = SIMD_T::or_si(vLit1, vMask4);
491 vLit1 = SIMD_T::or_si(vLit1, vMask5);
492 vLit1 = SIMD_T::or_si(vLit1, vMask6);
493 vLit1 = SIMD_T::or_si(vLit1, vMask7);
494
495 // Step to the right and go down again
496
497 // coverage pixel 0
498 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
499 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
500 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
501 vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N);
502 vMask0 = SIMD_T::and_si(vMask0, vEdge2N);
503
504 // coverage pixel 1
505 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
506 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
507 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
508 vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
509 vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
510
511 // coverage pixel 2
512 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
513 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
514 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
515 vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
516 vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
517
518 // coverage pixel 3
519 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
520 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
521 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
522 vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
523 vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
524
525 // And for the last time - to the right and up
526
527 // coverage pixel 4
528 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
529 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
530 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
531 vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
532 vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
533
534 // coverage pixel 5
535 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
536 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
537 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
538 vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
539 vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
540
541 // coverage pixel 6
542 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
543 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
544 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
545 vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
546 vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
547
548 // coverage pixel 7
549 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
550 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
551 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
552 vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
553 vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
554
555 typename SIMD_T::Integer vLit2 = SIMD_T::or_si(vMask0, vMask1);
556 vLit2 = SIMD_T::or_si(vLit2, vMask2);
557 vLit2 = SIMD_T::or_si(vLit2, vMask3);
558 vLit2 = SIMD_T::or_si(vLit2, vMask4);
559 vLit2 = SIMD_T::or_si(vLit2, vMask5);
560 vLit2 = SIMD_T::or_si(vLit2, vMask6);
561 vLit2 = SIMD_T::or_si(vLit2, vMask7);
562
563 typename SIMD_T::Integer vLit = SIMD_T::or_si(vLit1, vLit2);
564
565 #else
566 // Generic algorithm sweeping in row by row order
567 typename SIMD_T::Integer vRowMask[ER_SIMD_TILE_Y_DIM];
568
569 typename SIMD_T::Integer vEdge0N = vEdge0;
570 typename SIMD_T::Integer vEdge1N = vEdge1;
571 typename SIMD_T::Integer vEdge2N = vEdge2;
572
573 for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
574 {
575 // Store edge values at the beginning of the row
576 typename SIMD_T::Integer vRowEdge0 = vEdge0N;
577 typename SIMD_T::Integer vRowEdge1 = vEdge1N;
578 typename SIMD_T::Integer vRowEdge2 = vEdge2N;
579
580 typename SIMD_T::Integer vColMask[ER_SIMD_TILE_X_DIM];
581
582 for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
583 {
584 vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
585 vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);
586
587 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
588 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
589 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
590 }
591 vRowMask[row] = vColMask[0];
592 for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
593 {
594 vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);
595 }
596 // Restore values and go to the next row
597 vEdge0N = vRowEdge0;
598 vEdge1N = vRowEdge1;
599 vEdge2N = vRowEdge2;
600
601 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
602 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
603 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
604 }
605
606 // compress all masks
607 typename SIMD_T::Integer vLit = vRowMask[0];
608 for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
609 {
610 vLit = SIMD_T::or_si(vLit, vRowMask[row]);
611 }
612
613 #endif
614 // Check which triangles has any pixel lit
615 uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
616 uint32_t maskUnlit = ~maskLit & oneTileMask;
617
618 uint32_t oldTriMask = triMask;
619 triMask &= ~maskUnlit;
620
621 if (triMask ^ oldTriMask)
622 {
623 RDTSC_EVENT(FEEarlyRastExit, _mm_popcnt_u32(triMask & oneTileMask), 0);
624 }
625 return triMask;
626 }
627
628 #endif // Early rasterizer
629
630 //////////////////////////////////////////////////////////////////////////
631 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
632 /// culling, viewport transform, etc.
633 /// @param pDC - pointer to draw context.
634 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
636 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
637 /// @param primID - Primitive ID for each triangle.
638 /// @param viewportIdx - viewport array index for each triangle.
639 /// @tparam CT - ConservativeRastFETraits
640 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
BinTrianglesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 tri[3],uint32_t triMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)641 void SIMDCALL BinTrianglesImpl(
642 DRAW_CONTEXT *pDC,
643 PA_STATE &pa,
644 uint32_t workerId,
645 typename SIMD_T::Vec4 tri[3],
646 uint32_t triMask,
647 typename SIMD_T::Integer const &primID,
648 typename SIMD_T::Integer const &viewportIdx,
649 typename SIMD_T::Integer const &rtIdx)
650 {
651 SWR_CONTEXT *pContext = pDC->pContext;
652 const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
653
654 AR_BEGIN(FEBinTriangles, pDC->drawId);
655
656 const API_STATE& state = GetApiState(pDC);
657 const SWR_RASTSTATE& rastState = state.rastState;
658 const SWR_FRONTEND_STATE& feState = state.frontendState;
659
660 MacroTileMgr *pTileMgr = pDC->pTileMgr;
661
662 typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
663 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
664 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
665
666 if (feState.vpTransformDisable)
667 {
668 // RHW is passed in directly when VP transform is disabled
669 vRecipW0 = tri[0].v[3];
670 vRecipW1 = tri[1].v[3];
671 vRecipW2 = tri[2].v[3];
672 }
673 else
674 {
675 // Perspective divide
676 vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
677 vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
678 vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
679
680 tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
681 tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
682 tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
683
684 tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
685 tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
686 tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
687
688 tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
689 tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
690 tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
691
692 // Viewport transform to screen space coords
693 if (pa.viewportArrayActive)
694 {
695 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
696 }
697 else
698 {
699 viewportTransform<3>(tri, state.vpMatrices);
700 }
701 }
702
703 // Adjust for pixel center location
704 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
705
706 tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
707 tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
708
709 tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
710 tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
711
712 tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
713 tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
714
715 // Set vXi, vYi to required fixed point precision
716 typename SIMD_T::Integer vXi[3], vYi[3];
717 FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
718
719 // triangle setup
720 typename SIMD_T::Integer vAi[3], vBi[3];
721 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
722
723 // determinant
724 typename SIMD_T::Integer vDet[2];
725 calcDeterminantIntVertical(vAi, vBi, vDet);
726
727 // cull zero area
728 uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
729 uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
730
731 uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
732
733 // don't cull degenerate triangles if we're conservatively rasterizing
734 uint32_t origTriMask = triMask;
735 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
736 {
737 triMask &= ~cullZeroAreaMask;
738 }
739
740 // determine front winding tris
741 // CW +det
742 // CCW det < 0;
743 // 0 area triangles are marked as backfacing regardless of winding order,
744 // which is required behavior for conservative rast and wireframe rendering
745 uint32_t frontWindingTris;
746 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
747 {
748 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
749 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
750 }
751 else
752 {
753 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
754 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
755 }
756 frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
757
758 // cull
759 uint32_t cullTris;
760 switch ((SWR_CULLMODE)rastState.cullMode)
761 {
762 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
763 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
764 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
765 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
766 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
767 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
768 }
769
770 triMask &= ~cullTris;
771
772 if (origTriMask ^ triMask)
773 {
774 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
775 }
776
/// Note: these variable initializations must stay above any 'goto endBinTriangles'
778 // compute per tri backface
779 uint32_t frontFaceMask = frontWindingTris;
780 uint32_t *pPrimID = (uint32_t *)&primID;
781 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
782 DWORD triIndex = 0;
783
784 uint32_t edgeEnable;
785 PFN_WORK_FUNC pfnWork;
786 if (CT::IsConservativeT::value)
787 {
788 // determine which edges of the degenerate tri, if any, are valid to rasterize.
789 // used to call the appropriate templated rasterizer function
790 if (cullZeroAreaMask > 0)
791 {
792 // e0 = v1-v0
793 const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
794 const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
795
796 uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
797
798 // e1 = v2-v1
799 const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
800 const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
801
802 uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
803
804 // e2 = v0-v2
805 // if v0 == v1 & v1 == v2, v0 == v2
806 uint32_t e2Mask = e0Mask & e1Mask;
807 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
808
809 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
810 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
811 e0Mask = pdep_u32(e0Mask, 0x00249249);
812
813 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
814 e1Mask = pdep_u32(e1Mask, 0x00492492);
815
816 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
817 e2Mask = pdep_u32(e2Mask, 0x00924924);
818
819 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
820 }
821 else
822 {
823 edgeEnable = 0x00FFFFFF;
824 }
825 }
826 else
827 {
828 // degenerate triangles won't be sent to rasterizer; just enable all edges
829 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
830 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
831 }
832
833 SIMDBBOX_T<SIMD_T> bbox;
834
835 if (!triMask)
836 {
837 goto endBinTriangles;
838 }
839
840 // Calc bounding box of triangles
841 calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
842
843 // determine if triangle falls between pixel centers and discard
844 // only discard for non-MSAA case and when conservative rast is disabled
845 // (xmin + 127) & ~255
846 // (xmax + 128) & ~255
847 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
848 (!CT::IsConservativeT::value))
849 {
850 origTriMask = triMask;
851
852 int cullCenterMask;
853
854 {
855 typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
856 xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
857 typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
858 xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
859
860 typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
861
862 typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
863 ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
864 typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
865 ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
866
867 typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
868
869 vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
870 cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
871 }
872
873 triMask &= ~cullCenterMask;
874
875 if (origTriMask ^ triMask)
876 {
877 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
878 }
879 }
880
881 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
882 // Gather the AOS effective scissor rects based on the per-prim VP index.
883 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
884 {
885 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
886 if (pa.viewportArrayActive)
887
888 {
889 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
890 }
891 else // broadcast fast path for non-VPAI case.
892 {
893 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
894 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
895 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
896 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
897 }
898
899 // Make triangle bbox inclusive
900 bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
901 bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
902
903 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
904 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
905 bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
906 bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
907 }
908
909 if (CT::IsConservativeT::value)
910 {
911 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
912 // some area. Bump the xmax/ymax edges out
913
914 typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
915 bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
916
917 typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
918 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
919 }
920
921 // Cull tris completely outside scissor
922 {
923 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
924 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
925 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
926 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
927 triMask = triMask & ~maskOutsideScissor;
928 }
929
930 #if KNOB_ENABLE_EARLY_RAST
931 if (rastState.sampleCount == SWR_MULTISAMPLE_1X && !CT::IsConservativeT::value)
932 {
933 // Try early rasterization - culling small triangles which do not cover any pixels
934
935 // convert to ER tiles
936 SIMDBBOX_T<SIMD_T> er_bbox;
937
938 er_bbox.xmin = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
939 er_bbox.xmax = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
940 er_bbox.ymin = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
941 er_bbox.ymax = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
942
943 typename SIMD_T::Integer vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
944 typename SIMD_T::Integer vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
945
946 // Take only triangles that fit into ER tile
947 uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
948
949 if (oneTileMask)
950 {
951 // determine CW tris (det > 0)
952 uint32_t maskCwLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
953 uint32_t maskCwHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
954 uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2));
955
956 // Try early rasterization
957 triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
958
959 if (!triMask)
960 {
961 AR_END(FEBinTriangles, 1);
962 return;
963 }
964 }
965
966 }
967 #endif
968
969 endBinTriangles:
970
971
972 // Send surviving triangles to the line or point binner based on fill mode
973 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
974 {
975 // Simple non-conformant wireframe mode, useful for debugging
976 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
977 typename SIMD_T::Vec4 line[2];
978 typename SIMD_T::Float recipW[2];
979
980 line[0] = tri[0];
981 line[1] = tri[1];
982 recipW[0] = vRecipW0;
983 recipW[1] = vRecipW1;
984
985 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
986
987 line[0] = tri[1];
988 line[1] = tri[2];
989 recipW[0] = vRecipW1;
990 recipW[1] = vRecipW2;
991
992 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
993
994 line[0] = tri[2];
995 line[1] = tri[0];
996 recipW[0] = vRecipW2;
997 recipW[1] = vRecipW0;
998
999 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
1000
1001 AR_END(FEBinTriangles, 1);
1002 return;
1003 }
1004 else if (rastState.fillMode == SWR_FILLMODE_POINT)
1005 {
1006 // Bin 3 points
1007 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
1008 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
1009 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
1010
1011 AR_END(FEBinTriangles, 1);
1012 return;
1013 }
1014
1015 // Convert triangle bbox to macrotile units.
1016 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1017 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1018 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1019 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1020
1021 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1022
1023 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1024 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1025 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1026 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1027
1028 // transpose verts needed for backend
1029 /// @todo modify BE to take non-transformed verts
1030 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1031 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1032 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1033 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1034
1035 TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1036 TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1037 TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1038 TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1039
1040 // scan remaining valid triangles and bin each separately
1041 while (_BitScanForward(&triIndex, triMask))
1042 {
1043 uint32_t linkageCount = state.backendState.numAttributes;
1044 uint32_t numScalarAttribs = linkageCount * 4;
1045
1046 BE_WORK work;
1047 work.type = DRAW;
1048
1049 bool isDegenerate;
1050 if (CT::IsConservativeT::value)
1051 {
1052 // only rasterize valid edges if we have a degenerate primitive
1053 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1054 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1055 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
1056
1057 // Degenerate triangles are required to be constant interpolated
1058 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1059 }
1060 else
1061 {
1062 isDegenerate = false;
1063 work.pfnWork = pfnWork;
1064 }
1065
1066 // Select attribute processor
1067 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
1068 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1069
1070 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1071
1072 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1073 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1074 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1075
1076 auto pArena = pDC->pArena;
1077 SWR_ASSERT(pArena != nullptr);
1078
1079 // store active attribs
1080 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1081 desc.pAttribs = pAttribs;
1082 desc.numAttribs = linkageCount;
1083 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1084
1085 // store triangle vertex data
1086 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1087
1088 SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
1089 SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
1090 SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
1091 SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1092
1093 // store user clip distances
1094 if (state.backendState.clipDistanceMask)
1095 {
1096 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1097 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1098 ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1099 }
1100
1101 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1102 {
1103 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1104 {
1105 #if KNOB_ENABLE_TOSS_POINTS
1106 if (!KNOB_TOSS_SETUP_TRIS)
1107 #endif
1108 {
1109 pTileMgr->enqueue(x, y, &work);
1110 }
1111 }
1112 }
1113
1114 triMask &= ~(1 << triIndex);
1115 }
1116
1117 AR_END(FEBinTriangles, 1);
1118 }
1119
1120 template <typename CT>
BinTriangles(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector tri[3],uint32_t triMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1121 void BinTriangles(
1122 DRAW_CONTEXT *pDC,
1123 PA_STATE &pa,
1124 uint32_t workerId,
1125 simdvector tri[3],
1126 uint32_t triMask,
1127 simdscalari const &primID,
1128 simdscalari const &viewportIdx,
1129 simdscalari const &rtIdx)
1130 {
1131 BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1132 }
1133
1134 #if USE_SIMD16_FRONTEND
1135 template <typename CT>
BinTriangles_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector tri[3],uint32_t triMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1136 void SIMDCALL BinTriangles_simd16(
1137 DRAW_CONTEXT *pDC,
1138 PA_STATE &pa,
1139 uint32_t workerId,
1140 simd16vector tri[3],
1141 uint32_t triMask,
1142 simd16scalari const &primID,
1143 simd16scalari const &viewportIdx,
1144 simd16scalari const &rtIdx)
1145 {
1146 BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1147 }
1148
1149 #endif
1150 struct FEBinTrianglesChooser
1151 {
1152 typedef PFN_PROCESS_PRIMS FuncType;
1153
1154 template <typename... ArgsB>
GetFuncFEBinTrianglesChooser1155 static FuncType GetFunc()
1156 {
1157 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1158 }
1159 };
1160
1161 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc(bool IsConservative)1162 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1163 {
1164 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1165 }
1166
1167 #if USE_SIMD16_FRONTEND
1168 struct FEBinTrianglesChooser_simd16
1169 {
1170 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1171
1172 template <typename... ArgsB>
GetFuncFEBinTrianglesChooser_simd161173 static FuncType GetFunc()
1174 {
1175 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1176 }
1177 };
1178
1179 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc_simd16(bool IsConservative)1180 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1181 {
1182 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1183 }
1184
1185 #endif
1186
1187 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPostSetupPointsImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 prim[],uint32_t primMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)1188 void BinPostSetupPointsImpl(
1189 DRAW_CONTEXT *pDC,
1190 PA_STATE &pa,
1191 uint32_t workerId,
1192 typename SIMD_T::Vec4 prim[],
1193 uint32_t primMask,
1194 typename SIMD_T::Integer const &primID,
1195 typename SIMD_T::Integer const &viewportIdx,
1196 typename SIMD_T::Integer const &rtIdx)
1197 {
1198 SWR_CONTEXT *pContext = pDC->pContext;
1199
1200 AR_BEGIN(FEBinPoints, pDC->drawId);
1201
1202 typename SIMD_T::Vec4 &primVerts = prim[0];
1203
1204 const API_STATE& state = GetApiState(pDC);
1205 const SWR_RASTSTATE& rastState = state.rastState;
1206 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1207
1208 // Select attribute processor
1209 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1210 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1211
1212 // convert to fixed point
1213 typename SIMD_T::Integer vXi, vYi;
1214
1215 vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
1216 vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
1217
1218 if (CanUseSimplePoints(pDC))
1219 {
1220 // adjust for ymin-xmin rule
1221 vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
1222 vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
1223
1224 // cull points off the ymin-xmin edge of the viewport
1225 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
1226 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1227
1228 // compute macro tile coordinates
1229 typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1230 typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1231
1232 OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1233
1234 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
1235 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
1236
1237 // compute raster tile coordinates
1238 typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1239 typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1240
1241 // compute raster tile relative x,y for coverage mask
1242 typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1243 typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1244
1245 typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1246 typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1247
1248 OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1249 OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1250
1251 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
1252 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
1253
1254 OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1255 OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1256
1257 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
1258 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
1259
1260 OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1261 SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1262
1263 // store render target array index
1264 const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1265
1266 uint32_t *pPrimID = (uint32_t *)&primID;
1267 DWORD primIndex = 0;
1268
1269 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1270
1271 // scan remaining valid triangles and bin each separately
1272 while (_BitScanForward(&primIndex, primMask))
1273 {
1274 uint32_t linkageCount = backendState.numAttributes;
1275 uint32_t numScalarAttribs = linkageCount * 4;
1276
1277 BE_WORK work;
1278 work.type = DRAW;
1279
1280 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1281
1282 // points are always front facing
1283 desc.triFlags.frontFacing = 1;
1284 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1285 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1286
1287 work.pfnWork = RasterizeSimplePoint;
1288
1289 auto pArena = pDC->pArena;
1290 SWR_ASSERT(pArena != nullptr);
1291
1292 // store attributes
1293 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1294 desc.pAttribs = pAttribs;
1295 desc.numAttribs = linkageCount;
1296
1297 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1298
1299 // store raster tile aligned x, y, perspective correct z
1300 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1301 desc.pTriBuffer = pTriBuffer;
1302 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1303 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1304 *pTriBuffer = aZ[primIndex];
1305
1306 uint32_t tX = aTileRelativeX[primIndex];
1307 uint32_t tY = aTileRelativeY[primIndex];
1308
1309 // pack the relative x,y into the coverageMask, the rasterizer will
1310 // generate the true coverage mask from it
1311 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1312
1313 // bin it
1314 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1315 #if KNOB_ENABLE_TOSS_POINTS
1316 if (!KNOB_TOSS_SETUP_TRIS)
1317 #endif
1318 {
1319 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1320 }
1321
1322 primMask &= ~(1 << primIndex);
1323 }
1324 }
1325 else
1326 {
1327 // non simple points need to be potentially binned to multiple macro tiles
1328 typename SIMD_T::Float vPointSize;
1329
1330 if (rastState.pointParam)
1331 {
1332 typename SIMD_T::Vec4 size[3];
1333 pa.Assemble(VERTEX_SGV_SLOT, size);
1334 vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1335 }
1336 else
1337 {
1338 vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1339 }
1340
1341 // bloat point to bbox
1342 SIMDBBOX_T<SIMD_T> bbox;
1343
1344 bbox.xmin = bbox.xmax = vXi;
1345 bbox.ymin = bbox.ymax = vYi;
1346
1347 typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1348 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1349
1350 bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1351 bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1352 bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1353 bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1354
1355 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1356 // Gather the AOS effective scissor rects based on the per-prim VP index.
1357 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1358 {
1359 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1360
1361 if (pa.viewportArrayActive)
1362 {
1363 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1364 }
1365 else // broadcast fast path for non-VPAI case.
1366 {
1367 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1368 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1369 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1370 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1371 }
1372
1373 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1374 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1375 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1376 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1377 }
1378
1379 // Cull bloated points completely outside scissor
1380 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1381 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1382 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1383 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1384 primMask = primMask & ~maskOutsideScissor;
1385
1386 // Convert bbox to macrotile units.
1387 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1388 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1389 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1390 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1391
1392 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1393
1394 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1395 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1396 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1397 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1398
1399 // store render target array index
1400 const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1401
1402 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1403 SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1404
1405 uint32_t *pPrimID = (uint32_t *)&primID;
1406
1407 OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1408 OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1409 OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1410
1411 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1412 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1413 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1414
1415 // scan remaining valid prims and bin each separately
1416 const SWR_BACKEND_STATE& backendState = state.backendState;
1417 DWORD primIndex;
1418 while (_BitScanForward(&primIndex, primMask))
1419 {
1420 uint32_t linkageCount = backendState.numAttributes;
1421 uint32_t numScalarAttribs = linkageCount * 4;
1422
1423 BE_WORK work;
1424 work.type = DRAW;
1425
1426 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1427
1428 desc.triFlags.frontFacing = 1;
1429 desc.triFlags.pointSize = aPointSize[primIndex];
1430 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1431 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1432
1433 work.pfnWork = RasterizeTriPoint;
1434
1435 auto pArena = pDC->pArena;
1436 SWR_ASSERT(pArena != nullptr);
1437
1438 // store active attribs
1439 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1440 desc.numAttribs = linkageCount;
1441 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1442
1443 // store point vertex data
1444 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1445 desc.pTriBuffer = pTriBuffer;
1446 *pTriBuffer++ = aPrimVertsX[primIndex];
1447 *pTriBuffer++ = aPrimVertsY[primIndex];
1448 *pTriBuffer = aPrimVertsZ[primIndex];
1449
1450 // store user clip distances
1451 if (backendState.clipDistanceMask)
1452 {
1453 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1454 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1455 float dists[8];
1456 float one = 1.0f;
1457 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1458 for (uint32_t i = 0; i < numClipDist; i++) {
1459 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1460 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1461 desc.pUserClipBuffer[3 * i + 2] = dists[i];
1462 }
1463 }
1464
1465 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1466 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1467 {
1468 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1469 {
1470 #if KNOB_ENABLE_TOSS_POINTS
1471 if (!KNOB_TOSS_SETUP_TRIS)
1472 #endif
1473 {
1474 pTileMgr->enqueue(x, y, &work);
1475 }
1476 }
1477 }
1478
1479 primMask &= ~(1 << primIndex);
1480 }
1481 }
1482
1483 AR_END(FEBinPoints, 1);
1484 }
1485
1486 //////////////////////////////////////////////////////////////////////////
1487 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1488 /// @param pDC - pointer to draw context.
1489 /// @param pa - The primitive assembly object.
1490 /// @param workerId - thread's worker id. Even thread has a unique id.
1491 /// @param tri - Contains point position data for SIMDs worth of points.
1492 /// @param primID - Primitive ID for each point.
1493 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPointsImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 prim[3],uint32_t primMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)1494 void BinPointsImpl(
1495 DRAW_CONTEXT *pDC,
1496 PA_STATE &pa,
1497 uint32_t workerId,
1498 typename SIMD_T::Vec4 prim[3],
1499 uint32_t primMask,
1500 typename SIMD_T::Integer const &primID,
1501 typename SIMD_T::Integer const &viewportIdx,
1502 typename SIMD_T::Integer const &rtIdx)
1503 {
1504 const API_STATE& state = GetApiState(pDC);
1505 const SWR_FRONTEND_STATE& feState = state.frontendState;
1506 const SWR_RASTSTATE& rastState = state.rastState;
1507
1508 if (!feState.vpTransformDisable)
1509 {
1510 // perspective divide
1511 typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1512
1513 prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1514 prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1515 prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1516
1517 // viewport transform to screen coords
1518 if (pa.viewportArrayActive)
1519 {
1520 viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1521 }
1522 else
1523 {
1524 viewportTransform<1>(prim, state.vpMatrices);
1525 }
1526 }
1527
1528 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1529
1530 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1531 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1532
1533 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1534 pDC,
1535 pa,
1536 workerId,
1537 prim,
1538 primMask,
1539 primID,
1540 viewportIdx,
1541 rtIdx);
1542 }
1543
BinPoints(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[3],uint32_t primMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1544 void BinPoints(
1545 DRAW_CONTEXT *pDC,
1546 PA_STATE &pa,
1547 uint32_t workerId,
1548 simdvector prim[3],
1549 uint32_t primMask,
1550 simdscalari const &primID,
1551 simdscalari const &viewportIdx,
1552 simdscalari const &rtIdx)
1553 {
1554 BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1555 pDC,
1556 pa,
1557 workerId,
1558 prim,
1559 primMask,
1560 primID,
1561 viewportIdx,
1562 rtIdx);
1563 }
1564
1565 #if USE_SIMD16_FRONTEND
BinPoints_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector prim[3],uint32_t primMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1566 void SIMDCALL BinPoints_simd16(
1567 DRAW_CONTEXT *pDC,
1568 PA_STATE &pa,
1569 uint32_t workerId,
1570 simd16vector prim[3],
1571 uint32_t primMask,
1572 simd16scalari const &primID,
1573 simd16scalari const &viewportIdx,
1574 simd16scalari const & rtIdx)
1575 {
1576 BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1577 pDC,
1578 pa,
1579 workerId,
1580 prim,
1581 primMask,
1582 primID,
1583 viewportIdx,
1584 rtIdx);
1585 }
1586
1587 #endif
1588 //////////////////////////////////////////////////////////////////////////
1589 /// @brief Bin SIMD lines to the backend.
1590 /// @param pDC - pointer to draw context.
1591 /// @param pa - The primitive assembly object.
1592 /// @param workerId - thread's worker id. Even thread has a unique id.
1593 /// @param tri - Contains line position data for SIMDs worth of points.
1594 /// @param primID - Primitive ID for each line.
1595 /// @param viewportIdx - Viewport Array Index for each line.
1596 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPostSetupLinesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 prim[],typename SIMD_T::Float recipW[],uint32_t primMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)1597 void BinPostSetupLinesImpl(
1598 DRAW_CONTEXT *pDC,
1599 PA_STATE &pa,
1600 uint32_t workerId,
1601 typename SIMD_T::Vec4 prim[],
1602 typename SIMD_T::Float recipW[],
1603 uint32_t primMask,
1604 typename SIMD_T::Integer const &primID,
1605 typename SIMD_T::Integer const &viewportIdx,
1606 typename SIMD_T::Integer const &rtIdx)
1607 {
1608 SWR_CONTEXT *pContext = pDC->pContext;
1609 const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1610
1611 AR_BEGIN(FEBinLines, pDC->drawId);
1612
1613 const API_STATE &state = GetApiState(pDC);
1614 const SWR_RASTSTATE &rastState = state.rastState;
1615
1616 // Select attribute processor
1617 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1618 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1619
1620 typename SIMD_T::Float &vRecipW0 = recipW[0];
1621 typename SIMD_T::Float &vRecipW1 = recipW[1];
1622
1623 // convert to fixed point
1624 typename SIMD_T::Integer vXi[2], vYi[2];
1625
1626 vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1627 vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1628 vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1629 vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1630
1631 // compute x-major vs y-major mask
1632 typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1633 typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1634 typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1635 uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1636
1637 // cull zero-length lines
1638 typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1639 vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1640
1641 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1642
1643 uint32_t *pPrimID = (uint32_t *)&primID;
1644 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1645
1646 // Calc bounding box of lines
1647 SIMDBBOX_T<SIMD_T> bbox;
1648 bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1649 bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1650 bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1651 bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1652
1653 // bloat bbox by line width along minor axis
1654 typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1655 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1656
1657 SIMDBBOX_T<SIMD_T> bloatBox;
1658
1659 bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1660 bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1661 bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1662 bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1663
1664 bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1665 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1666 bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1667 bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1668
1669 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1670 {
1671 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1672
1673 if (pa.viewportArrayActive)
1674 {
1675 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1676 }
1677 else // broadcast fast path for non-VPAI case.
1678 {
1679 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1680 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1681 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1682 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1683 }
1684
1685 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1686 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1687 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1688 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1689 }
1690
1691 // Cull prims completely outside scissor
1692 {
1693 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1694 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1695 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1696 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1697 primMask = primMask & ~maskOutsideScissor;
1698 }
1699
1700 // transpose verts needed for backend
1701 /// @todo modify BE to take non-transformed verts
1702 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1703 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1704 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1705 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1706
1707 if (!primMask)
1708 {
1709 goto endBinLines;
1710 }
1711
1712 // Convert triangle bbox to macrotile units.
1713 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1714 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1715 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1716 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1717
1718 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1719
1720 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1721 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1722 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1723 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1724
1725 TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1726 TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1727 TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1728 TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
1729
1730 // scan remaining valid prims and bin each separately
1731 DWORD primIndex;
1732 while (_BitScanForward(&primIndex, primMask))
1733 {
1734 uint32_t linkageCount = state.backendState.numAttributes;
1735 uint32_t numScalarAttribs = linkageCount * 4;
1736
1737 BE_WORK work;
1738 work.type = DRAW;
1739
1740 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1741
1742 desc.triFlags.frontFacing = 1;
1743 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1744 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1745 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1746
1747 work.pfnWork = RasterizeLine;
1748
1749 auto pArena = pDC->pArena;
1750 SWR_ASSERT(pArena != nullptr);
1751
1752 // store active attribs
1753 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1754 desc.numAttribs = linkageCount;
1755 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1756
1757 // store line vertex data
1758 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1759
1760 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1761 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1762 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1763 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1764
1765 // store user clip distances
1766 if (state.backendState.clipDistanceMask)
1767 {
1768 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1769 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1770 ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1771 }
1772
1773 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1774 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1775 {
1776 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1777 {
1778 #if KNOB_ENABLE_TOSS_POINTS
1779 if (!KNOB_TOSS_SETUP_TRIS)
1780 #endif
1781 {
1782 pTileMgr->enqueue(x, y, &work);
1783 }
1784 }
1785 }
1786
1787 primMask &= ~(1 << primIndex);
1788 }
1789
1790 endBinLines:
1791
1792 AR_END(FEBinLines, 1);
1793 }
1794
1795 //////////////////////////////////////////////////////////////////////////
1796 /// @brief Bin SIMD lines to the backend.
1797 /// @param pDC - pointer to draw context.
1798 /// @param pa - The primitive assembly object.
1799 /// @param workerId - thread's worker id. Even thread has a unique id.
1800 /// @param tri - Contains line position data for SIMDs worth of points.
1801 /// @param primID - Primitive ID for each line.
1802 /// @param viewportIdx - Viewport Array Index for each line.
1803 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinLinesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 prim[3],uint32_t primMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)1804 void SIMDCALL BinLinesImpl(
1805 DRAW_CONTEXT *pDC,
1806 PA_STATE &pa,
1807 uint32_t workerId,
1808 typename SIMD_T::Vec4 prim[3],
1809 uint32_t primMask,
1810 typename SIMD_T::Integer const &primID,
1811 typename SIMD_T::Integer const &viewportIdx,
1812 typename SIMD_T::Integer const & rtIdx)
1813 {
1814 const API_STATE& state = GetApiState(pDC);
1815 const SWR_RASTSTATE& rastState = state.rastState;
1816 const SWR_FRONTEND_STATE& feState = state.frontendState;
1817
1818 typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1819
1820 if (!feState.vpTransformDisable)
1821 {
1822 // perspective divide
1823 vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1824 vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1825
1826 prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1827 prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1828
1829 prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1830 prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1831
1832 prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1833 prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1834
1835 // viewport transform to screen coords
1836 if (pa.viewportArrayActive)
1837 {
1838 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1839 }
1840 else
1841 {
1842 viewportTransform<2>(prim, state.vpMatrices);
1843 }
1844 }
1845
1846 // adjust for pixel center location
1847 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1848
1849 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1850 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1851
1852 prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1853 prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1854
1855 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1856 pDC,
1857 pa,
1858 workerId,
1859 prim,
1860 vRecipW,
1861 primMask,
1862 primID,
1863 viewportIdx,
1864 rtIdx);
1865 }
1866
BinLines(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[],uint32_t primMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1867 void BinLines(
1868 DRAW_CONTEXT *pDC,
1869 PA_STATE &pa,
1870 uint32_t workerId,
1871 simdvector prim[],
1872 uint32_t primMask,
1873 simdscalari const &primID,
1874 simdscalari const &viewportIdx,
1875 simdscalari const &rtIdx)
1876 {
1877 BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1878 }
1879
1880 #if USE_SIMD16_FRONTEND
BinLines_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector prim[3],uint32_t primMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1881 void SIMDCALL BinLines_simd16(
1882 DRAW_CONTEXT *pDC,
1883 PA_STATE &pa,
1884 uint32_t workerId,
1885 simd16vector prim[3],
1886 uint32_t primMask,
1887 simd16scalari const &primID,
1888 simd16scalari const &viewportIdx,
1889 simd16scalari const &rtIdx)
1890 {
1891 BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1892 }
1893
1894 #endif
1895