• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34 
35 // Temp storage used by the clipper
36 extern THREAD simdvertex tlsTempVertices[7];
37 
// Bit position of the lowest clip code. The codes are shifted out of the float
// mantissa to prevent denormalized values when they are used in a float compare.
#define CLIPCODE_SHIFT 23

enum SWR_CLIPCODES
{
    // View frustum planes: one bit each.
    FRUSTUM_LEFT     = (1 << (CLIPCODE_SHIFT + 0)),
    FRUSTUM_TOP      = (1 << (CLIPCODE_SHIFT + 1)),
    FRUSTUM_RIGHT    = (1 << (CLIPCODE_SHIFT + 2)),
    FRUSTUM_BOTTOM   = (1 << (CLIPCODE_SHIFT + 3)),

    FRUSTUM_NEAR     = (1 << (CLIPCODE_SHIFT + 4)),
    FRUSTUM_FAR      = (1 << (CLIPCODE_SHIFT + 5)),

    // Set when w <= 0.
    NEGW             = (1 << (CLIPCODE_SHIFT + 6)),

    // Guardband edges share a single high bit and are told apart by 4 separate
    // LSBs. That is sufficient because guardband codes are combined with a
    // union, rather than an intersection, of clipcodes.
    GUARDBAND_LEFT   = ((1 << (CLIPCODE_SHIFT + 7)) | 0x1),
    GUARDBAND_TOP    = ((1 << (CLIPCODE_SHIFT + 7)) | 0x2),
    GUARDBAND_RIGHT  = ((1 << (CLIPCODE_SHIFT + 7)) | 0x4),
    GUARDBAND_BOTTOM = ((1 << (CLIPCODE_SHIFT + 7)) | 0x8)
};

// All six view frustum planes.
#define FRUSTUM_CLIP_MASK \
    (FRUSTUM_LEFT | FRUSTUM_TOP | FRUSTUM_RIGHT | FRUSTUM_BOTTOM | FRUSTUM_NEAR | FRUSTUM_FAR)

// Codes that send a primitive to the clipper: near/far, the guardband edges and negative w.
#define GUARDBAND_CLIP_MASK \
    (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | NEGW)
61 
// Scalar clip entry point; only the declaration lives in this header.
// NOTE(review): the roles of the in/out buffers are inferred from the parameter
// names only -- confirm against the definition before relying on them.
void Clip(
    const float *pTriangle,
    const float *pAttribs,
    int numAttribs,
    float *pOutTriangles,
    int *numVerts,
    float *pOutAttribs);
64 
65 INLINE
ComputeClipCodes(const API_STATE & state,const simdvector & vertex,simdscalar & clipCodes,simdscalari viewportIndexes)66 void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
67 {
68     clipCodes = _simd_setzero_ps();
69 
70     // -w
71     simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
72 
73     // FRUSTUM_LEFT
74     simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
75     clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
76 
77     // FRUSTUM_TOP
78     vRes = _simd_cmplt_ps(vertex.y, vNegW);
79     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));
80 
81     // FRUSTUM_RIGHT
82     vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
83     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));
84 
85     // FRUSTUM_BOTTOM
86     vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
87     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));
88 
89     if (state.rastState.depthClipEnable)
90     {
91         // FRUSTUM_NEAR
92         // DX clips depth [0..w], GL clips [-w..w]
93         if (state.rastState.clipHalfZ)
94         {
95             vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
96         }
97         else
98         {
99             vRes = _simd_cmplt_ps(vertex.z, vNegW);
100         }
101         clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));
102 
103         // FRUSTUM_FAR
104         vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
105         clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
106     }
107 
108     // NEGW
109     vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
110     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));
111 
112     // GUARDBAND_LEFT
113     simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
114     vRes = _simd_cmplt_ps(vertex.x, gbMult);
115     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));
116 
117     // GUARDBAND_TOP
118     gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
119     vRes = _simd_cmplt_ps(vertex.y, gbMult);
120     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));
121 
122     // GUARDBAND_RIGHT
123     gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
124     vRes = _simd_cmpgt_ps(vertex.x, gbMult);
125     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));
126 
127     // GUARDBAND_BOTTOM
128     gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
129     vRes = _simd_cmpgt_ps(vertex.y, gbMult);
130     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
131 }
132 
133 template<uint32_t NumVertsPerPrim>
134 class Clipper
135 {
136 public:
Clipper(uint32_t in_workerId,DRAW_CONTEXT * in_pDC)137     Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
138         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
139     {
140         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
141     }
142 
ComputeClipCodes(simdvector vertex[],simdscalari viewportIndexes)143     void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
144     {
145         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
146         {
147             ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes);
148         }
149     }
150 
ComputeClipCodeIntersection()151     simdscalar ComputeClipCodeIntersection()
152     {
153         simdscalar result = this->clipCodes[0];
154         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
155         {
156             result = _simd_and_ps(result, this->clipCodes[i]);
157         }
158         return result;
159     }
160 
ComputeClipCodeUnion()161     simdscalar ComputeClipCodeUnion()
162     {
163         simdscalar result = this->clipCodes[0];
164         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
165         {
166             result = _simd_or_ps(result, this->clipCodes[i]);
167         }
168         return result;
169     }
170 
ComputeNegWMask()171     int ComputeNegWMask()
172     {
173         simdscalar clipCodeUnion = ComputeClipCodeUnion();
174         clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
175         return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
176     }
177 
ComputeClipMask()178     int ComputeClipMask()
179     {
180         simdscalar clipUnion = ComputeClipCodeUnion();
181         clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
182         return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
183     }
184 
185     // clipper is responsible for culling any prims with NAN coordinates
ComputeNaNMask(simdvector prim[])186     int ComputeNaNMask(simdvector prim[])
187     {
188         simdscalar vNanMask = _simd_setzero_ps();
189         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
190         {
191             simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
192             vNanMask = _simd_or_ps(vNanMask, vNan01);
193             simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
194             vNanMask = _simd_or_ps(vNanMask, vNan23);
195         }
196 
197         return _simd_movemask_ps(vNanMask);
198     }
199 
ComputeUserClipCullMask(PA_STATE & pa,simdvector prim[])200     int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
201     {
202         uint8_t cullMask = this->state.rastState.cullDistanceMask;
203         simdscalar vClipCullMask = _simd_setzero_ps();
204         DWORD index;
205 
206         simdvector vClipCullDistLo[3];
207         simdvector vClipCullDistHi[3];
208 
209         pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
210         pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
211         while (_BitScanForward(&index, cullMask))
212         {
213             cullMask &= ~(1 << index);
214             uint32_t slot = index >> 2;
215             uint32_t component = index & 0x3;
216 
217             simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
218             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
219             {
220                 simdscalar vCullComp;
221                 if (slot == 0)
222                 {
223                     vCullComp = vClipCullDistLo[e][component];
224                 }
225                 else
226                 {
227                     vCullComp = vClipCullDistHi[e][component];
228                 }
229 
230                 // cull if cull distance < 0 || NAN
231                 simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
232                 vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
233             }
234             vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
235         }
236 
237         // clipper should also discard any primitive with NAN clip distance
238         uint8_t clipMask = this->state.rastState.clipDistanceMask;
239         while (_BitScanForward(&index, clipMask))
240         {
241             clipMask &= ~(1 << index);
242             uint32_t slot = index >> 2;
243             uint32_t component = index & 0x3;
244 
245             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
246             {
247                 simdscalar vClipComp;
248                 if (slot == 0)
249                 {
250                     vClipComp = vClipCullDistLo[e][component];
251                 }
252                 else
253                 {
254                     vClipComp = vClipCullDistHi[e][component];
255                 }
256 
257                 simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
258                 vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
259             }
260         }
261 
262         return _simd_movemask_ps(vClipCullMask);
263     }
264 
265     // clip SIMD primitives
ClipSimd(const simdscalar & vPrimMask,const simdscalar & vClipMask,PA_STATE & pa,const simdscalari & vPrimId,const simdscalari & vViewportIdx)266     void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
267     {
268         // input/output vertex store for clipper
269         simdvertex vertices[7]; // maximum 7 verts generated per triangle
270 
271         LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
272         uint32_t provokingVertex = 0;
273         if(pa.binTopology == TOP_TRIANGLE_FAN)
274         {
275             provokingVertex = this->state.frontendState.provokingVertex.triFan;
276         }
277         ///@todo: line topology for wireframe?
278 
279         // assemble pos
280         simdvector tmpVector[NumVertsPerPrim];
281         pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
282         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
283         {
284             vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
285         }
286 
287         // assemble attribs
288         const SWR_BACKEND_STATE& backendState = this->state.backendState;
289 
290         int32_t maxSlot = -1;
291         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
292         {
293             // Compute absolute attrib slot in vertex array
294             uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
295             maxSlot = std::max<int32_t>(maxSlot, mapSlot);
296             uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
297 
298             pa.Assemble(inputSlot, tmpVector);
299 
300             // if constant interpolation enabled for this attribute, assign the provoking
301             // vertex values to all edges
302             if (_bittest(&constantInterpMask, slot))
303             {
304                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
305                 {
306                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
307                 }
308             }
309             else
310             {
311                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
312                 {
313                     vertices[i].attrib[inputSlot] = tmpVector[i];
314                 }
315             }
316         }
317 
318         // assemble user clip distances if enabled
319         if (this->state.rastState.clipDistanceMask & 0xf)
320         {
321             pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
322             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
323             {
324                 vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
325             }
326         }
327 
328         if (this->state.rastState.clipDistanceMask & 0xf0)
329         {
330             pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
331             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
332             {
333                 vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
334             }
335         }
336 
337         uint32_t numAttribs = maxSlot + 1;
338 
339         simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
340 
341         // set up new PA for binning clipped primitives
342         PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
343         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
344         if (NumVertsPerPrim == 3)
345         {
346             pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
347             clipTopology = TOP_TRIANGLE_FAN;
348 
349             // so that the binner knows to bloat wide points later
350             if (pa.binTopology == TOP_POINT_LIST)
351                 clipTopology = TOP_POINT_LIST;
352 
353         }
354         else if (NumVertsPerPrim == 2)
355         {
356             pfnBinFunc = BinLines;
357             clipTopology = TOP_LINE_LIST;
358         }
359         else
360         {
361             SWR_ASSERT(0 && "Unexpected points in clipper.");
362         }
363 
364         uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
365         uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
366         uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
367 
368         const simdscalari vOffsets = _mm256_set_epi32(
369             0 * sizeof(simdvertex),  // unused lane
370             6 * sizeof(simdvertex),
371             5 * sizeof(simdvertex),
372             4 * sizeof(simdvertex),
373             3 * sizeof(simdvertex),
374             2 * sizeof(simdvertex),
375             1 * sizeof(simdvertex),
376             0 * sizeof(simdvertex));
377 
378         // only need to gather 7 verts
379         // @todo dynamic mask based on actual # of verts generated per lane
380         const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
381 
382         uint32_t numClippedPrims = 0;
383         for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
384         {
385             uint32_t numEmittedVerts = pVertexCount[inputPrim];
386             if (numEmittedVerts < NumVertsPerPrim)
387             {
388                 continue;
389             }
390             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
391 
392             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
393             numClippedPrims += numEmittedPrims;
394 
395             // tranpose clipper output so that each lane's vertices are in SIMD order
396             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
397             // for triangle fan
398             simdvertex transposedPrims[2];
399 
400             // transpose pos
401             uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
402             for (uint32_t c = 0; c < 4; ++c)
403             {
404                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
405                 pBase += sizeof(simdscalar);
406             }
407 
408             // transpose attribs
409             pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
410             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
411             {
412                 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
413                 for (uint32_t c = 0; c < 4; ++c)
414                 {
415                     transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
416                     pBase += sizeof(simdscalar);
417                 }
418             }
419 
420             // transpose user clip distances if enabled
421             if (this->state.rastState.clipDistanceMask & 0xf)
422             {
423                 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
424                 for (uint32_t c = 0; c < 4; ++c)
425                 {
426                     transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
427                     pBase += sizeof(simdscalar);
428                 }
429             }
430 
431             if (this->state.rastState.clipDistanceMask & 0xf0)
432             {
433                 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
434                 for (uint32_t c = 0; c < 4; ++c)
435                 {
436                     transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
437                     pBase += sizeof(simdscalar);
438                 }
439             }
440 
441             PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
442 
443             while (clipPa.GetNextStreamOutput())
444             {
445                 do
446                 {
447                     simdvector attrib[NumVertsPerPrim];
448                     bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
449                     if (assemble)
450                     {
451                         static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
452                         pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
453                     }
454                 } while (clipPa.NextPrim());
455             }
456         }
457 
458         // update global pipeline stat
459         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
460     }
461 
462     // execute the clipper stage
ExecuteStage(PA_STATE & pa,simdvector prim[],uint32_t primMask,simdscalari primId,simdscalari viewportIdx)463     void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
464     {
465         SWR_ASSERT(pa.pDC != nullptr);
466         SWR_CONTEXT* pContext = pa.pDC->pContext;
467 
468         // set up binner based on PA state
469         PFN_PROCESS_PRIMS pfnBinner;
470         switch (pa.binTopology)
471         {
472         case TOP_POINT_LIST:
473             pfnBinner = BinPoints;
474             break;
475         case TOP_LINE_LIST:
476         case TOP_LINE_STRIP:
477         case TOP_LINE_LOOP:
478         case TOP_LINE_LIST_ADJ:
479         case TOP_LISTSTRIP_ADJ:
480             pfnBinner = BinLines;
481             break;
482         default:
483             pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
484             break;
485         };
486 
487         // update clipper invocations pipeline stat
488         uint32_t numInvoc = _mm_popcnt_u32(primMask);
489         UPDATE_STAT_FE(CInvocations, numInvoc);
490 
491         ComputeClipCodes(prim, viewportIdx);
492 
493         // cull prims with NAN coords
494         primMask &= ~ComputeNaNMask(prim);
495 
496         // user cull distance cull
497         if (this->state.rastState.cullDistanceMask)
498         {
499             primMask &= ~ComputeUserClipCullMask(pa, prim);
500         }
501 
502         // cull prims outside view frustum
503         simdscalar clipIntersection = ComputeClipCodeIntersection();
504         int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
505 
506         // skip clipping for points
507         uint32_t clipMask = 0;
508         if (NumVertsPerPrim != 1)
509         {
510             clipMask = primMask & ComputeClipMask();
511         }
512 
513         if (clipMask)
514         {
515             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
516             // we have to clip tris, execute the clipper, which will also
517             // call the binner
518             ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
519             AR_END(FEGuardbandClip, 1);
520         }
521         else if (validMask)
522         {
523             // update CPrimitives pipeline state
524             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
525 
526             // forward valid prims directly to binner
527             pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
528         }
529     }
530 
531 private:
ComputeInterpFactor(simdscalar boundaryCoord0,simdscalar boundaryCoord1)532     inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
533     {
534         return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
535     }
536 
ComputeOffsets(uint32_t attrib,simdscalari vIndices,uint32_t component)537     inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
538     {
539         const uint32_t simdVertexStride = sizeof(simdvertex);
540         const uint32_t componentStride = sizeof(simdscalar);
541         const uint32_t attribStride = sizeof(simdvector);
542         const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
543             3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
544 
545         // step to the simdvertex
546         simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
547 
548         // step to the attribute and component
549         vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
550 
551         // step to the lane
552         vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
553 
554         return vOffsets;
555     }
556 
557     // gathers a single component for a given attribute for each SIMD lane
GatherComponent(const float * pBuffer,uint32_t attrib,simdscalar vMask,simdscalari vIndices,uint32_t component)558     inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
559     {
560         simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
561         simdscalar vSrc = _mm256_undefined_ps();
562         return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
563     }
564 
ScatterComponent(const float * pBuffer,uint32_t attrib,simdscalar vMask,simdscalari vIndices,uint32_t component,simdscalar vSrc)565     inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
566     {
567         simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
568 
569         uint32_t* pOffsets = (uint32_t*)&vOffsets;
570         float* pSrc = (float*)&vSrc;
571         uint32_t mask = _simd_movemask_ps(vMask);
572         DWORD lane;
573         while (_BitScanForward(&lane, mask))
574         {
575             mask &= ~(1 << lane);
576             uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
577             *(float*)pBuf = pSrc[lane];
578         }
579     }
580 
581     template<SWR_CLIPCODES ClippingPlane>
intersect(const simdscalar & vActiveMask,const simdscalari & s,const simdscalari & p,const simdvector & v1,const simdvector & v2,simdscalari & outIndex,const float * pInVerts,uint32_t numInAttribs,float * pOutVerts)582     inline void intersect(
583         const simdscalar& vActiveMask,  // active lanes to operate on
584         const simdscalari& s,           // index to first edge vertex v0 in pInPts.
585         const simdscalari& p,           // index to second edge vertex v1 in pInPts.
586         const simdvector& v1,           // vertex 0 position
587         const simdvector& v2,           // vertex 1 position
588         simdscalari& outIndex,          // output index.
589         const float *pInVerts,          // array of all the input positions.
590         uint32_t numInAttribs,          // number of attributes per vertex.
591         float *pOutVerts)               // array of output positions. We'll write our new intersection point at i*4.
592     {
593         // compute interpolation factor
594         simdscalar t;
595         switch (ClippingPlane)
596         {
597         case FRUSTUM_LEFT:      t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
598         case FRUSTUM_RIGHT:     t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
599         case FRUSTUM_TOP:       t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
600         case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
601         case FRUSTUM_NEAR:
602             // DX Znear plane is 0, GL is -w
603             if (this->state.rastState.clipHalfZ)
604             {
605                 t = ComputeInterpFactor(v1[2], v2[2]);
606             }
607             else
608             {
609                 t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
610             }
611             break;
612         case FRUSTUM_FAR:       t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
613         default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
614         };
615 
616         // interpolate position and store
617         for (uint32_t c = 0; c < 4; ++c)
618         {
619             simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
620             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
621         }
622 
623         // interpolate attributes and store
624         for (uint32_t a = 0; a < numInAttribs; ++a)
625         {
626             uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
627             for (uint32_t c = 0; c < 4; ++c)
628             {
629                 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
630                 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
631                 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
632                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
633             }
634         }
635 
636         // interpolate clip distance if enabled
637         if (this->state.rastState.clipDistanceMask & 0xf)
638         {
639             uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
640             for (uint32_t c = 0; c < 4; ++c)
641             {
642                 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
643                 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
644                 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
645                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
646             }
647         }
648 
649         if (this->state.rastState.clipDistanceMask & 0xf0)
650         {
651             uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
652             for (uint32_t c = 0; c < 4; ++c)
653             {
654                 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
655                 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
656                 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
657                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
658             }
659         }
660     }
661 
662     template<SWR_CLIPCODES ClippingPlane>
inside(const simdvector & v)663     inline simdscalar inside(const simdvector& v)
664     {
665         switch (ClippingPlane)
666         {
667         case FRUSTUM_LEFT:      return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
668         case FRUSTUM_RIGHT:     return _simd_cmple_ps(v[0], v[3]);
669         case FRUSTUM_TOP:       return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
670         case FRUSTUM_BOTTOM:    return _simd_cmple_ps(v[1], v[3]);
671         case FRUSTUM_NEAR:      return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
672         case FRUSTUM_FAR:       return _simd_cmple_ps(v[2], v[3]);
673         default:
674             SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
675             return _simd_setzero_ps();
676         }
677     }
678 
679     template<SWR_CLIPCODES ClippingPlane>
ClipTriToPlane(const float * pInVerts,const simdscalari & vNumInPts,uint32_t numInAttribs,float * pOutVerts)680     simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
681     {
682         simdscalari vCurIndex = _simd_setzero_si();
683         simdscalari vOutIndex = _simd_setzero_si();
684         simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
685 
686         while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
687         {
688             simdscalari s = vCurIndex;
689             simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
690             simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
691             p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
692 
693             // gather position
694             simdvector vInPos0, vInPos1;
695             for (uint32_t c = 0; c < 4; ++c)
696             {
697                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
698                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
699             }
700 
701             // compute inside mask
702             simdscalar s_in = inside<ClippingPlane>(vInPos0);
703             simdscalar p_in = inside<ClippingPlane>(vInPos1);
704 
705             // compute intersection mask (s_in != p_in)
706             simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
707             intersectMask = _simd_and_ps(intersectMask, vActiveMask);
708 
709             // store s if inside
710             s_in = _simd_and_ps(s_in, vActiveMask);
711             if (!_simd_testz_ps(s_in, s_in))
712             {
713                 // store position
714                 for (uint32_t c = 0; c < 4; ++c)
715                 {
716                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
717                 }
718 
719                 // store attribs
720                 for (uint32_t a = 0; a < numInAttribs; ++a)
721                 {
722                     uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
723                     for (uint32_t c = 0; c < 4; ++c)
724                     {
725                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
726                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
727                     }
728                 }
729 
730                 // store clip distance if enabled
731                 if (this->state.rastState.clipDistanceMask & 0xf)
732                 {
733                     uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
734                     for (uint32_t c = 0; c < 4; ++c)
735                     {
736                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
737                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
738                     }
739                 }
740 
741                 if (this->state.rastState.clipDistanceMask & 0xf0)
742                 {
743                     uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
744                     for (uint32_t c = 0; c < 4; ++c)
745                     {
746                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
747                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
748                     }
749                 }
750 
751                 // increment outIndex
752                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
753             }
754 
755             // compute and store intersection
756             if (!_simd_testz_ps(intersectMask, intersectMask))
757             {
758                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
759 
760                 // increment outIndex for active lanes
761                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
762             }
763 
764             // increment loop index and update active mask
765             vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
766             vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
767         }
768 
769         return vOutIndex;
770     }
771 
772     template<SWR_CLIPCODES ClippingPlane>
ClipLineToPlane(const float * pInVerts,const simdscalari & vNumInPts,uint32_t numInAttribs,float * pOutVerts)773     simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
774     {
775         simdscalari vCurIndex = _simd_setzero_si();
776         simdscalari vOutIndex = _simd_setzero_si();
777         simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
778 
779         if (!_simd_testz_ps(vActiveMask, vActiveMask))
780         {
781             simdscalari s = vCurIndex;
782             simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
783 
784             // gather position
785             simdvector vInPos0, vInPos1;
786             for (uint32_t c = 0; c < 4; ++c)
787             {
788                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
789                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
790             }
791 
792             // compute inside mask
793             simdscalar s_in = inside<ClippingPlane>(vInPos0);
794             simdscalar p_in = inside<ClippingPlane>(vInPos1);
795 
796             // compute intersection mask (s_in != p_in)
797             simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
798             intersectMask = _simd_and_ps(intersectMask, vActiveMask);
799 
800             // store s if inside
801             s_in = _simd_and_ps(s_in, vActiveMask);
802             if (!_simd_testz_ps(s_in, s_in))
803             {
804                 for (uint32_t c = 0; c < 4; ++c)
805                 {
806                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
807                 }
808 
809                 // interpolate attributes and store
810                 for (uint32_t a = 0; a < numInAttribs; ++a)
811                 {
812                     uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
813                     for (uint32_t c = 0; c < 4; ++c)
814                     {
815                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
816                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
817                     }
818                 }
819 
820                 // increment outIndex
821                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
822             }
823 
824             // compute and store intersection
825             if (!_simd_testz_ps(intersectMask, intersectMask))
826             {
827                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
828 
829                 // increment outIndex for active lanes
830                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
831             }
832 
833             // store p if inside
834             p_in = _simd_and_ps(p_in, vActiveMask);
835             if (!_simd_testz_ps(p_in, p_in))
836             {
837                 for (uint32_t c = 0; c < 4; ++c)
838                 {
839                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
840                 }
841 
842                 // interpolate attributes and store
843                 for (uint32_t a = 0; a < numInAttribs; ++a)
844                 {
845                     uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
846                     for (uint32_t c = 0; c < 4; ++c)
847                     {
848                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
849                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
850                     }
851                 }
852 
853                 // increment outIndex
854                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
855             }
856         }
857 
858         return vOutIndex;
859     }
860 
861     //////////////////////////////////////////////////////////////////////////
862     /// @brief Vertical clipper. Clips SIMD primitives at a time
863     /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
864     /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
865     /// @param numAttribs - number of valid input attribs, including position
ClipPrims(float * pVertices,const simdscalar & vPrimMask,const simdscalar & vClipMask,int numAttribs)866     simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
867     {
868         // temp storage
869         float* pTempVerts = (float*)&tlsTempVertices[0];
870 
871         // zero out num input verts for non-active lanes
872         simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
873         vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
874 
875         // clip prims to frustum
876         simdscalari vNumOutPts;
877         if (NumVertsPerPrim == 3)
878         {
879             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
880             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
881             vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
882             vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
883             vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
884             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
885         }
886         else
887         {
888             SWR_ASSERT(NumVertsPerPrim == 2);
889             vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
890             vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
891             vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
892             vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
893             vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
894             vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
895         }
896 
897         // restore num verts for non-clipped, active lanes
898         simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
899         vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
900 
901         return vNumOutPts;
902     }
903 
904     const uint32_t workerId{ 0 };   // defaults to 0; NOTE(review): presumably the index of the worker thread using this clipper - confirm
905     DRAW_CONTEXT* pDC{ nullptr };   // defaults to nullptr; NOTE(review): presumably the draw context being clipped - confirm
906     const API_STATE& state;         // API state; the clip routines read state.rastState.clipDistanceMask and state.rastState.clipHalfZ
907     simdscalar clipCodes[NumVertsPerPrim];  // one SIMD value per vertex of the primitive; NOTE(review): presumably the per-vertex clip codes - confirm
908 };
909 
910 
911 // pipeline stage functions: clip entry points named for triangles, lines and points; declarations only, all three take the same parameter list
912 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
913 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
914 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
915