• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa_avx.cpp
24 *
25 * @brief AVX implementation for primitive assembly.
26 *        N primitives are assembled at a time, where N is the SIMD width.
27 *        A state machine, that is specific for a given topology, drives the
28 *        assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #include "context.h"
32 #include "pa.h"
33 #include "frontend.h"
34 
35 #if (KNOB_SIMD_WIDTH == 8)
36 
swizzleLane0(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)37 INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
38 {
39     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
40     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
41     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
42 }
43 
swizzleLane1(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)44 INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
45 {
46     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
47     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
48     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
49 }
50 
swizzleLane2(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)51 INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
52 {
53     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
54     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
55     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
56 }
57 
swizzleLane3(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)58 INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
59 {
60     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
61     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
62     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
63 }
64 
swizzleLane4(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)65 INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
66 {
67     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
68     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
69     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
70 }
71 
swizzleLane5(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)72 INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
73 {
74     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
75     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
76     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
77 }
78 
swizzleLane6(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)79 INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
80 {
81     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
82     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
83     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
84 }
85 
swizzleLane7(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)86 INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
87 {
88     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
89     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
90     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
91 }
92 
swizzleLane0(const simdvector & v)93 INLINE simd4scalar swizzleLane0(const simdvector &v)
94 {
95     return swizzleLane0(v.x, v.y, v.z, v.w);
96 }
97 
swizzleLane1(const simdvector & v)98 INLINE simd4scalar swizzleLane1(const simdvector &v)
99 {
100     return swizzleLane1(v.x, v.y, v.z, v.w);
101 }
102 
swizzleLane2(const simdvector & v)103 INLINE simd4scalar swizzleLane2(const simdvector &v)
104 {
105     return swizzleLane2(v.x, v.y, v.z, v.w);
106 }
107 
swizzleLane3(const simdvector & v)108 INLINE simd4scalar swizzleLane3(const simdvector &v)
109 {
110     return swizzleLane3(v.x, v.y, v.z, v.w);
111 }
112 
swizzleLane4(const simdvector & v)113 INLINE simd4scalar swizzleLane4(const simdvector &v)
114 {
115     return swizzleLane4(v.x, v.y, v.z, v.w);
116 }
117 
swizzleLane5(const simdvector & v)118 INLINE simd4scalar swizzleLane5(const simdvector &v)
119 {
120     return swizzleLane5(v.x, v.y, v.z, v.w);
121 }
122 
swizzleLane6(const simdvector & v)123 INLINE simd4scalar swizzleLane6(const simdvector &v)
124 {
125     return swizzleLane6(v.x, v.y, v.z, v.w);
126 }
127 
swizzleLane7(const simdvector & v)128 INLINE simd4scalar swizzleLane7(const simdvector &v)
129 {
130     return swizzleLane7(v.x, v.y, v.z, v.w);
131 }
132 
swizzleLaneN(const simdvector & v,int lane)133 INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
134 {
135     switch (lane)
136     {
137     case 0:
138         return swizzleLane0(v);
139     case 1:
140         return swizzleLane1(v);
141     case 2:
142         return swizzleLane2(v);
143     case 3:
144         return swizzleLane3(v);
145     case 4:
146         return swizzleLane4(v);
147     case 5:
148         return swizzleLane5(v);
149     case 6:
150         return swizzleLane6(v);
151     case 7:
152         return swizzleLane7(v);
153     default:
154         return _mm_setzero_ps();
155     }
156 }
157 
158 #if ENABLE_AVX512_SIMD16
swizzleLane0(const simd16vector & v)159 INLINE simd4scalar swizzleLane0(const simd16vector &v)
160 {
161     return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
162 }
163 
swizzleLane1(const simd16vector & v)164 INLINE simd4scalar swizzleLane1(const simd16vector &v)
165 {
166     return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
167 }
168 
swizzleLane2(const simd16vector & v)169 INLINE simd4scalar swizzleLane2(const simd16vector &v)
170 {
171     return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
172 }
173 
swizzleLane3(const simd16vector & v)174 INLINE simd4scalar swizzleLane3(const simd16vector &v)
175 {
176     return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
177 }
178 
swizzleLane4(const simd16vector & v)179 INLINE simd4scalar swizzleLane4(const simd16vector &v)
180 {
181     return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
182 }
183 
swizzleLane5(const simd16vector & v)184 INLINE simd4scalar swizzleLane5(const simd16vector &v)
185 {
186     return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
187 }
188 
swizzleLane6(const simd16vector & v)189 INLINE simd4scalar swizzleLane6(const simd16vector &v)
190 {
191     return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
192 }
193 
swizzleLane7(const simd16vector & v)194 INLINE simd4scalar swizzleLane7(const simd16vector &v)
195 {
196     return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
197 }
198 
swizzleLane8(const simd16vector & v)199 INLINE simd4scalar swizzleLane8(const simd16vector &v)
200 {
201     return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
202 }
203 
swizzleLane9(const simd16vector & v)204 INLINE simd4scalar swizzleLane9(const simd16vector &v)
205 {
206     return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
207 }
208 
swizzleLaneA(const simd16vector & v)209 INLINE simd4scalar swizzleLaneA(const simd16vector &v)
210 {
211     return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
212 }
213 
swizzleLaneB(const simd16vector & v)214 INLINE simd4scalar swizzleLaneB(const simd16vector &v)
215 {
216     return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
217 }
218 
swizzleLaneC(const simd16vector & v)219 INLINE simd4scalar swizzleLaneC(const simd16vector &v)
220 {
221     return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
222 }
223 
swizzleLaneD(const simd16vector & v)224 INLINE simd4scalar swizzleLaneD(const simd16vector &v)
225 {
226     return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
227 }
228 
swizzleLaneE(const simd16vector & v)229 INLINE simd4scalar swizzleLaneE(const simd16vector &v)
230 {
231     return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
232 }
233 
swizzleLaneF(const simd16vector & v)234 INLINE simd4scalar swizzleLaneF(const simd16vector &v)
235 {
236     return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
237 }
238 
swizzleLaneN(const simd16vector & v,int lane)239 INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
240 {
241     switch (lane)
242     {
243     case 0:
244         return swizzleLane0(v);
245     case 1:
246         return swizzleLane1(v);
247     case 2:
248         return swizzleLane2(v);
249     case 3:
250         return swizzleLane3(v);
251     case 4:
252         return swizzleLane4(v);
253     case 5:
254         return swizzleLane5(v);
255     case 6:
256         return swizzleLane6(v);
257     case 7:
258         return swizzleLane7(v);
259     case 8:
260         return swizzleLane8(v);
261     case 9:
262         return swizzleLane9(v);
263     case 10:
264         return swizzleLaneA(v);
265     case 11:
266         return swizzleLaneB(v);
267     case 12:
268         return swizzleLaneC(v);
269     case 13:
270         return swizzleLaneD(v);
271     case 14:
272         return swizzleLaneE(v);
273     case 15:
274         return swizzleLaneF(v);
275     default:
276         return _mm_setzero_ps();
277     }
278 }
279 
280 #endif
281 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
282 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
283 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
284 #if ENABLE_AVX512_SIMD16
285 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
286 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
287 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
288 #endif
289 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
290 
291 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
292 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
293 #if ENABLE_AVX512_SIMD16
294 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
295 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
296 #endif
297 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
298 
299 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
300 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
301 #if ENABLE_AVX512_SIMD16
302 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
303 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
304 #endif
305 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
306 
307 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
308 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
309 #if ENABLE_AVX512_SIMD16
310 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
311 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
312 #endif
313 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
314 
315 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
316 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
317 #if ENABLE_AVX512_SIMD16
318 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
319 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
320 #endif
321 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
322 
323 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
324 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
325 #if ENABLE_AVX512_SIMD16
326 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
327 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
328 #endif
329 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
330 
331 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
332 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
333 #if ENABLE_AVX512_SIMD16
334 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
335 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
336 #endif
337 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
338 
339 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
340 #if ENABLE_AVX512_SIMD16
341 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
342 #endif
343 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
344 
345 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
346 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
347 bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
348 #if ENABLE_AVX512_SIMD16
349 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
350 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
351 bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
352 #endif
353 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
354 
355 template <uint32_t TotalControlPoints>
PaPatchListSingle(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])356 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
357 {
358     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
359     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
360     // Each attribute has 4 components.
361 
362     /// @todo Optimize this
363 
364 #if USE_SIMD16_FRONTEND
365     if (pa.useAlternateOffset)
366     {
367         primIndex += KNOB_SIMD_WIDTH;
368     }
369 
370 #endif
371     float* pOutVec = (float*)verts;
372 
373     for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
374     {
375         uint32_t input_cp = primIndex * TotalControlPoints + cp;
376 #if USE_SIMD16_FRONTEND
377         uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
378         uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
379 
380 #else
381         uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
382         uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
383 
384 #endif
385         // Loop over all components of the attribute
386         for (uint32_t i = 0; i < 4; ++i)
387         {
388 #if USE_SIMD16_FRONTEND
389             const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
390 #else
391             const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
392 #endif
393             pOutVec[cp * 4 + i] = pInputVec[input_lane];
394         }
395     }
396 }
397 
398 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
PaPatchList(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])399 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
400 {
401     SetNextPaState(
402         pa,
403         PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
404         PaPatchListSingle<TotalControlPoints>);
405 
406     return false;
407 }
408 
409 template<uint32_t TotalControlPoints>
PaPatchListTerm(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])410 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
411 {
412     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
413     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
414     // Each attribute has 4 components.
415 
416     /// @todo Optimize this
417 
418 #if USE_SIMD16_FRONTEND
419     uint32_t lane_offset = 0;
420 
421     if (pa.useAlternateOffset)
422     {
423         lane_offset = KNOB_SIMD_WIDTH;
424     }
425 
426 #endif
427     // Loop over all components of the attribute
428     for (uint32_t i = 0; i < 4; ++i)
429     {
430         for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
431         {
432             float vec[KNOB_SIMD_WIDTH];
433             for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
434             {
435 #if USE_SIMD16_FRONTEND
436                 uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp;
437                 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
438                 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
439 
440                 const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
441 #else
442                 uint32_t input_cp = lane * TotalControlPoints + cp;
443                 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
444                 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
445 
446                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
447 #endif
448                 vec[lane] = pInputVec[input_lane];
449             }
450             verts[cp][i] = _simd_loadu_ps(vec);
451         }
452     }
453 
454     SetNextPaState(
455         pa,
456         PaPatchList<TotalControlPoints>,
457         PaPatchListSingle<TotalControlPoints>,
458         0,
459         PA_STATE_OPT::SIMD_WIDTH,
460         true);
461 
462     return true;
463 }
464 
465 #if ENABLE_AVX512_SIMD16
466 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
PaPatchList_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])467 static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
468 {
469     SetNextPaState_simd16(
470         pa,
471         PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
472         PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
473         PaPatchListSingle<TotalControlPoints>);
474 
475     return false;
476 }
477 
478 template<uint32_t TotalControlPoints>
PaPatchListTerm_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])479 static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
480 {
481     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
482     // KNOB_SIMD16_WIDTH * 1 patch.  This function is called once per attribute.
483     // Each attribute has 4 components.
484 
485     /// @todo Optimize this
486 
487     // Loop over all components of the attribute
488     for (uint32_t i = 0; i < 4; ++i)
489     {
490         for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
491         {
492             float vec[KNOB_SIMD16_WIDTH];
493             for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane)
494             {
495                 uint32_t input_cp = lane * TotalControlPoints + cp;
496                 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
497                 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
498 
499                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
500                 vec[lane] = pInputVec[input_lane];
501             }
502             verts[cp][i] = _simd16_loadu_ps(vec);
503         }
504     }
505 
506     SetNextPaState_simd16(
507         pa,
508         PaPatchList_simd16<TotalControlPoints>,
509         PaPatchList<TotalControlPoints>,
510         PaPatchListSingle<TotalControlPoints>,
511         0,
512         PA_STATE_OPT::SIMD_WIDTH,
513         true);
514 
515     return true;
516 }
517 
518 #endif
519 #define PA_PATCH_LIST_TERMINATOR(N) \
520     template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
521                            { return PaPatchListTerm<N>(pa, slot, verts); }
522 PA_PATCH_LIST_TERMINATOR(1)
523 PA_PATCH_LIST_TERMINATOR(2)
524 PA_PATCH_LIST_TERMINATOR(3)
525 PA_PATCH_LIST_TERMINATOR(4)
526 PA_PATCH_LIST_TERMINATOR(5)
527 PA_PATCH_LIST_TERMINATOR(6)
528 PA_PATCH_LIST_TERMINATOR(7)
529 PA_PATCH_LIST_TERMINATOR(8)
530 PA_PATCH_LIST_TERMINATOR(9)
531 PA_PATCH_LIST_TERMINATOR(10)
532 PA_PATCH_LIST_TERMINATOR(11)
533 PA_PATCH_LIST_TERMINATOR(12)
534 PA_PATCH_LIST_TERMINATOR(13)
535 PA_PATCH_LIST_TERMINATOR(14)
536 PA_PATCH_LIST_TERMINATOR(15)
537 PA_PATCH_LIST_TERMINATOR(16)
538 PA_PATCH_LIST_TERMINATOR(17)
539 PA_PATCH_LIST_TERMINATOR(18)
540 PA_PATCH_LIST_TERMINATOR(19)
541 PA_PATCH_LIST_TERMINATOR(20)
542 PA_PATCH_LIST_TERMINATOR(21)
543 PA_PATCH_LIST_TERMINATOR(22)
544 PA_PATCH_LIST_TERMINATOR(23)
545 PA_PATCH_LIST_TERMINATOR(24)
546 PA_PATCH_LIST_TERMINATOR(25)
547 PA_PATCH_LIST_TERMINATOR(26)
548 PA_PATCH_LIST_TERMINATOR(27)
549 PA_PATCH_LIST_TERMINATOR(28)
550 PA_PATCH_LIST_TERMINATOR(29)
551 PA_PATCH_LIST_TERMINATOR(30)
552 PA_PATCH_LIST_TERMINATOR(31)
553 PA_PATCH_LIST_TERMINATOR(32)
554 #undef PA_PATCH_LIST_TERMINATOR
555 
556 #if ENABLE_AVX512_SIMD16
557 #define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \
558     template<> bool PaPatchList_simd16<N, N>(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])\
559                            { return PaPatchListTerm_simd16<N>(pa, slot, verts); }
560 PA_PATCH_LIST_TERMINATOR_SIMD16(1)
561 PA_PATCH_LIST_TERMINATOR_SIMD16(2)
562 PA_PATCH_LIST_TERMINATOR_SIMD16(3)
563 PA_PATCH_LIST_TERMINATOR_SIMD16(4)
564 PA_PATCH_LIST_TERMINATOR_SIMD16(5)
565 PA_PATCH_LIST_TERMINATOR_SIMD16(6)
566 PA_PATCH_LIST_TERMINATOR_SIMD16(7)
567 PA_PATCH_LIST_TERMINATOR_SIMD16(8)
568 PA_PATCH_LIST_TERMINATOR_SIMD16(9)
569 PA_PATCH_LIST_TERMINATOR_SIMD16(10)
570 PA_PATCH_LIST_TERMINATOR_SIMD16(11)
571 PA_PATCH_LIST_TERMINATOR_SIMD16(12)
572 PA_PATCH_LIST_TERMINATOR_SIMD16(13)
573 PA_PATCH_LIST_TERMINATOR_SIMD16(14)
574 PA_PATCH_LIST_TERMINATOR_SIMD16(15)
575 PA_PATCH_LIST_TERMINATOR_SIMD16(16)
576 PA_PATCH_LIST_TERMINATOR_SIMD16(17)
577 PA_PATCH_LIST_TERMINATOR_SIMD16(18)
578 PA_PATCH_LIST_TERMINATOR_SIMD16(19)
579 PA_PATCH_LIST_TERMINATOR_SIMD16(20)
580 PA_PATCH_LIST_TERMINATOR_SIMD16(21)
581 PA_PATCH_LIST_TERMINATOR_SIMD16(22)
582 PA_PATCH_LIST_TERMINATOR_SIMD16(23)
583 PA_PATCH_LIST_TERMINATOR_SIMD16(24)
584 PA_PATCH_LIST_TERMINATOR_SIMD16(25)
585 PA_PATCH_LIST_TERMINATOR_SIMD16(26)
586 PA_PATCH_LIST_TERMINATOR_SIMD16(27)
587 PA_PATCH_LIST_TERMINATOR_SIMD16(28)
588 PA_PATCH_LIST_TERMINATOR_SIMD16(29)
589 PA_PATCH_LIST_TERMINATOR_SIMD16(30)
590 PA_PATCH_LIST_TERMINATOR_SIMD16(31)
591 PA_PATCH_LIST_TERMINATOR_SIMD16(32)
592 #undef PA_PATCH_LIST_TERMINATOR_SIMD16
593 
594 #endif
PaTriList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])595 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
596 {
597     SetNextPaState(pa, PaTriList1, PaTriListSingle0);
598     return false;    // Not enough vertices to assemble 4 or 8 triangles.
599 }
600 
PaTriList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])601 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
602 {
603     SetNextPaState(pa, PaTriList2, PaTriListSingle0);
604     return false;    // Not enough vertices to assemble 8 triangles.
605 }
606 
PaTriList2(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])607 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
608 {
609 #if KNOB_ARCH == KNOB_ARCH_AVX
610 #if USE_SIMD16_FRONTEND
611     simdvector a;
612     simdvector b;
613     simdvector c;
614 
615     if (!pa.useAlternateOffset)
616     {
617         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
618         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
619 
620         for (uint32_t i = 0; i < 4; i += 1)
621         {
622             a[i] = _simd16_extract_ps(a_16[i], 0);
623             b[i] = _simd16_extract_ps(a_16[i], 1);
624             c[i] = _simd16_extract_ps(b_16[i], 0);
625         }
626     }
627     else
628     {
629         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
630         const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
631 
632         for (uint32_t i = 0; i < 4; i += 1)
633         {
634             a[i] = _simd16_extract_ps(b_16[i], 1);
635             b[i] = _simd16_extract_ps(c_16[i], 0);
636             c[i] = _simd16_extract_ps(c_16[i], 1);
637         }
638     }
639 
640 #else
641     simdvector &a = PaGetSimdVector(pa, 0, slot);
642     simdvector &b = PaGetSimdVector(pa, 1, slot);
643     simdvector &c = PaGetSimdVector(pa, 2, slot);
644 
645 #endif
646     simdscalar s;
647 
648     // Tri Pattern - provoking vertex is always v0
649     //  v0 -> 0 3 6 9  12 15 18 21
650     //  v1 -> 1 4 7 10 13 16 19 22
651     //  v2 -> 2 5 8 11 14 17 20 23
652 
653     for (int i = 0; i < 4; ++i)
654     {
655         simdvector& v0 = verts[0];
656         v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
657         v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
658         v0[i] = _mm256_permute_ps(v0[i], 0x6C);
659         s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
660         v0[i] = _simd_blend_ps(v0[i], s, 0x44);
661 
662         simdvector& v1 = verts[1];
663         v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
664         v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
665         v1[i] = _mm256_permute_ps(v1[i], 0xB1);
666         s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
667         v1[i] = _simd_blend_ps(v1[i], s, 0x66);
668 
669         simdvector& v2 = verts[2];
670         v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
671         v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
672         v2[i] = _mm256_permute_ps(v2[i], 0xC6);
673         s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
674         v2[i] = _simd_blend_ps(v2[i], s, 0x22);
675     }
676 
677 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
678     const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
679     const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
680     const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
681 
682 #if USE_SIMD16_FRONTEND
683     simdvector a;
684     simdvector b;
685     simdvector c;
686 
687     if (!pa.useAlternateOffset)
688     {
689         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
690         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
691 
692         for (uint32_t i = 0; i < 4; i += 1)
693         {
694             a[i] = _simd16_extract_ps(a_16[i], 0);
695             b[i] = _simd16_extract_ps(a_16[i], 1);
696             c[i] = _simd16_extract_ps(b_16[i], 0);
697         }
698     }
699     else
700     {
701         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
702         const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
703 
704         for (uint32_t i = 0; i < 4; i += 1)
705         {
706             a[i] = _simd16_extract_ps(b_16[i], 1);
707             b[i] = _simd16_extract_ps(c_16[i], 0);
708             c[i] = _simd16_extract_ps(c_16[i], 1);
709         }
710     }
711 
712 #else
713     const simdvector &a = PaGetSimdVector(pa, 0, slot);
714     const simdvector &b = PaGetSimdVector(pa, 1, slot);
715     const simdvector &c = PaGetSimdVector(pa, 2, slot);
716 
717 #endif
718     //  v0 -> a0 a3 a6 b1 b4 b7 c2 c5
719     //  v1 -> a1 a4 a7 b2 b5 c0 c3 c6
720     //  v2 -> a2 a5 b0 b3 b6 c1 c4 c7
721 
722     simdvector &v0 = verts[0];
723     simdvector &v1 = verts[1];
724     simdvector &v2 = verts[2];
725 
726     // for simd x, y, z, and w
727     for (int i = 0; i < 4; ++i)
728     {
729         simdscalar temp0 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
730         simdscalar temp1 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
731         simdscalar temp2 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
732 
733         v0[i] = _simd_permute_ps(temp0, perm0);
734         v1[i] = _simd_permute_ps(temp1, perm1);
735         v2[i] = _simd_permute_ps(temp2, perm2);
736     }
737 
738 #endif
739     SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
740     return true;
741 }
742 
743 #if ENABLE_AVX512_SIMD16
PaTriList0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])744 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
745 {
746     SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
747     return false;    // Not enough vertices to assemble 16 triangles
748 }
749 
PaTriList1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])750 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
751 {
752     SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
753     return false;    // Not enough vertices to assemble 16 triangles
754 }
755 
PaTriList2_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])756 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
757 {
758     const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11,  8, 5, 2, 15, 12,  9, 6, 3, 0);
759     const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12,  9, 6, 3,  0, 13, 10, 7, 4, 1);
760     const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3,  0, 13, 10, 7, 4,  1, 14, 11, 8, 5, 2);
761 
762     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
763     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
764     const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
765 
766     simd16vector &v0 = verts[0];
767     simd16vector &v1 = verts[1];
768     simd16vector &v2 = verts[2];
769 
770     //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
771     //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
772     //  v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
773 
774     // for simd16 x, y, z, and w
775     for (int i = 0; i < 4; i += 1)
776     {
777         simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x4924), c[i], 0x2492);
778         simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x9249), c[i], 0x4924);
779         simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x2492), c[i], 0x9249);
780 
781         v0[i] = _simd16_permute_ps(temp0, perm0);
782         v1[i] = _simd16_permute_ps(temp1, perm1);
783         v2[i] = _simd16_permute_ps(temp2, perm2);
784     }
785 
786     SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
787     return true;
788 }
789 
790 #endif
PaTriListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])791 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
792 {
793 #if USE_SIMD16_FRONTEND
794     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
795     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
796     const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
797 
798     if (pa.useAlternateOffset)
799     {
800         primIndex += KNOB_SIMD_WIDTH;
801     }
802 
803     //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
804     //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
805     //  v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
806 
807     switch (primIndex)
808     {
809     case 0:
810         verts[0] = swizzleLane0(a);
811         verts[1] = swizzleLane1(a);
812         verts[2] = swizzleLane2(a);
813         break;
814     case 1:
815         verts[0] = swizzleLane3(a);
816         verts[1] = swizzleLane4(a);
817         verts[2] = swizzleLane5(a);
818         break;
819     case 2:
820         verts[0] = swizzleLane6(a);
821         verts[1] = swizzleLane7(a);
822         verts[2] = swizzleLane8(a);
823         break;
824     case 3:
825         verts[0] = swizzleLane9(a);
826         verts[1] = swizzleLaneA(a);
827         verts[2] = swizzleLaneB(a);
828         break;
829     case 4:
830         verts[0] = swizzleLaneC(a);
831         verts[1] = swizzleLaneD(a);
832         verts[2] = swizzleLaneE(a);
833         break;
834     case 5:
835         verts[0] = swizzleLaneF(a);
836         verts[1] = swizzleLane0(b);
837         verts[2] = swizzleLane1(b);
838         break;
839     case 6:
840         verts[0] = swizzleLane2(b);
841         verts[1] = swizzleLane3(b);
842         verts[2] = swizzleLane4(b);
843         break;
844     case 7:
845         verts[0] = swizzleLane5(b);
846         verts[1] = swizzleLane6(b);
847         verts[2] = swizzleLane7(b);
848         break;
849     case 8:
850         verts[0] = swizzleLane8(b);
851         verts[1] = swizzleLane9(b);
852         verts[2] = swizzleLaneA(b);
853         break;
854     case 9:
855         verts[0] = swizzleLaneB(b);
856         verts[1] = swizzleLaneC(b);
857         verts[2] = swizzleLaneD(b);
858         break;
859     case 10:
860         verts[0] = swizzleLaneE(b);
861         verts[1] = swizzleLaneF(b);
862         verts[2] = swizzleLane0(c);
863         break;
864     case 11:
865         verts[0] = swizzleLane1(c);
866         verts[1] = swizzleLane2(c);
867         verts[2] = swizzleLane3(c);
868         break;
869     case 12:
870         verts[0] = swizzleLane4(c);
871         verts[1] = swizzleLane5(c);
872         verts[2] = swizzleLane6(c);
873         break;
874     case 13:
875         verts[0] = swizzleLane7(c);
876         verts[1] = swizzleLane8(c);
877         verts[2] = swizzleLane9(c);
878         break;
879     case 14:
880         verts[0] = swizzleLaneA(c);
881         verts[1] = swizzleLaneB(c);
882         verts[2] = swizzleLaneC(c);
883         break;
884     case 15:
885         verts[0] = swizzleLaneD(c);
886         verts[1] = swizzleLaneE(c);
887         verts[2] = swizzleLaneF(c);
888         break;
889     };
890 #else
891     // We have 12 simdscalars contained within 3 simdvectors which
892     // hold at least 8 triangles worth of data. We want to assemble a single
893     // triangle with data in horizontal form.
894 
895     const simdvector &a = PaGetSimdVector(pa, 0, slot);
896     const simdvector &b = PaGetSimdVector(pa, 1, slot);
897     const simdvector &c = PaGetSimdVector(pa, 2, slot);
898 
899     // Convert from vertical to horizontal.
900     // Tri Pattern - provoking vertex is always v0
901     //  v0 -> 0 3 6 9  12 15 18 21
902     //  v1 -> 1 4 7 10 13 16 19 22
903     //  v2 -> 2 5 8 11 14 17 20 23
904 
905     switch (primIndex)
906     {
907     case 0:
908         verts[0] = swizzleLane0(a);
909         verts[1] = swizzleLane1(a);
910         verts[2] = swizzleLane2(a);
911         break;
912     case 1:
913         verts[0] = swizzleLane3(a);
914         verts[1] = swizzleLane4(a);
915         verts[2] = swizzleLane5(a);
916         break;
917     case 2:
918         verts[0] = swizzleLane6(a);
919         verts[1] = swizzleLane7(a);
920         verts[2] = swizzleLane0(b);
921         break;
922     case 3:
923         verts[0] = swizzleLane1(b);
924         verts[1] = swizzleLane2(b);
925         verts[2] = swizzleLane3(b);
926         break;
927     case 4:
928         verts[0] = swizzleLane4(b);
929         verts[1] = swizzleLane5(b);
930         verts[2] = swizzleLane6(b);
931         break;
932     case 5:
933         verts[0] = swizzleLane7(b);
934         verts[1] = swizzleLane0(c);
935         verts[2] = swizzleLane1(c);
936         break;
937     case 6:
938         verts[0] = swizzleLane2(c);
939         verts[1] = swizzleLane3(c);
940         verts[2] = swizzleLane4(c);
941         break;
942     case 7:
943         verts[0] = swizzleLane5(c);
944         verts[1] = swizzleLane6(c);
945         verts[2] = swizzleLane7(c);
946         break;
947     };
948 #endif
949 }
950 
PaTriStrip0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])951 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
952 {
953     SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
954     return false;    // Not enough vertices to assemble 8 triangles.
955 }
956 
PaTriStrip1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])957 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
958 {
959 #if USE_SIMD16_FRONTEND
960     simdvector a;
961     simdvector b;
962 
963     if (!pa.useAlternateOffset)
964     {
965         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
966 
967         for (uint32_t i = 0; i < 4; i += 1)
968         {
969             a[i] = _simd16_extract_ps(a_16[i], 0);
970             b[i] = _simd16_extract_ps(a_16[i], 1);
971         }
972     }
973     else
974     {
975         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
976 
977         for (uint32_t i = 0; i < 4; i += 1)
978         {
979             a[i] = _simd16_extract_ps(b_16[i], 0);
980             b[i] = _simd16_extract_ps(b_16[i], 1);
981         }
982     }
983 
984 #else
985     simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
986     simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
987 
988 #endif
989     simdscalar s;
990 
991     for(int i = 0; i < 4; ++i)
992     {
993         simdscalar a0 = a[i];
994         simdscalar b0 = b[i];
995 
996         // Tri Pattern - provoking vertex is always v0
997         //  v0 -> 01234567
998         //  v1 -> 13355779
999         //  v2 -> 22446688
1000         simdvector& v0 = verts[0];
1001         v0[i] = a0;
1002 
1003         //  s -> 4567891011
1004         s = _simd_permute2f128_ps(a0, b0, 0x21);
1005         //  s -> 23456789
1006         s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
1007 
1008         simdvector& v1 = verts[1];
1009         //  v1 -> 13355779
1010         v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
1011 
1012         simdvector& v2 = verts[2];
1013         //  v2 -> 22446688
1014         v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
1015     }
1016 
1017     SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1018     return true;
1019 }
1020 
1021 #if  ENABLE_AVX512_SIMD16
PaTriStrip0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1022 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1023 {
1024     SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
1025     return false;    // Not enough vertices to assemble 16 triangles.
1026 }
1027 
PaTriStrip1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1028 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1029 {
1030     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
1031     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
1032 
1033     simd16vector &v0 = verts[0];
1034     simd16vector &v1 = verts[1];
1035     simd16vector &v2 = verts[2];
1036 
1037     //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1038     //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1039     //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1040 
1041     // for simd16 x, y, z, and w
1042     for (int i = 0; i < 4; i += 1)
1043     {
1044         simd16scalar perm0 = _simd16_permute2f128_ps(a[i], a[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
1045         simd16scalar perm1 = _simd16_permute2f128_ps(b[i], b[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1046 
1047         simd16scalar blend = _simd16_blend_ps(perm0, perm1, 0xF000);                                // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
1048         simd16scalar shuff = _simd16_shuffle_ps(a[i], blend, _MM_SHUFFLE(1, 0, 3, 2));              // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
1049 
1050         v0[i] = a[i];                                                                               // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1051         v1[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1052         v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2));                           // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1053     }
1054 
1055     SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1056     return true;
1057 }
1058 
1059 #endif
PaTriStripSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1060 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1061 {
1062 #if USE_SIMD16_FRONTEND
1063     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
1064     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
1065 
1066     if (pa.useAlternateOffset)
1067     {
1068         primIndex += KNOB_SIMD_WIDTH;
1069     }
1070 
1071     //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1072     //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1073     //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1074 
1075     switch (primIndex)
1076     {
1077     case 0:
1078         verts[0] = swizzleLane0(a);
1079         verts[1] = swizzleLane1(a);
1080         verts[2] = swizzleLane2(a);
1081         break;
1082     case 1:
1083         verts[0] = swizzleLane1(a);
1084         verts[1] = swizzleLane3(a);
1085         verts[2] = swizzleLane2(a);
1086         break;
1087     case 2:
1088         verts[0] = swizzleLane2(a);
1089         verts[1] = swizzleLane3(a);
1090         verts[2] = swizzleLane4(a);
1091         break;
1092     case 3:
1093         verts[0] = swizzleLane3(a);
1094         verts[1] = swizzleLane5(a);
1095         verts[2] = swizzleLane4(a);
1096         break;
1097     case 4:
1098         verts[0] = swizzleLane4(a);
1099         verts[1] = swizzleLane5(a);
1100         verts[2] = swizzleLane6(a);
1101         break;
1102     case 5:
1103         verts[0] = swizzleLane5(a);
1104         verts[1] = swizzleLane7(a);
1105         verts[2] = swizzleLane6(a);
1106         break;
1107     case 6:
1108         verts[0] = swizzleLane6(a);
1109         verts[1] = swizzleLane7(a);
1110         verts[2] = swizzleLane8(a);
1111         break;
1112     case 7:
1113         verts[0] = swizzleLane7(a);
1114         verts[1] = swizzleLane9(a);
1115         verts[2] = swizzleLane8(a);
1116         break;
1117     case 8:
1118         verts[0] = swizzleLane8(a);
1119         verts[1] = swizzleLane9(a);
1120         verts[2] = swizzleLaneA(a);
1121         break;
1122     case 9:
1123         verts[0] = swizzleLane9(a);
1124         verts[1] = swizzleLaneB(a);
1125         verts[2] = swizzleLaneA(a);
1126         break;
1127     case 10:
1128         verts[0] = swizzleLaneA(a);
1129         verts[1] = swizzleLaneB(a);
1130         verts[2] = swizzleLaneC(a);
1131         break;
1132     case 11:
1133         verts[0] = swizzleLaneB(a);
1134         verts[1] = swizzleLaneD(a);
1135         verts[2] = swizzleLaneC(a);
1136         break;
1137     case 12:
1138         verts[0] = swizzleLaneC(a);
1139         verts[1] = swizzleLaneD(a);
1140         verts[2] = swizzleLaneE(a);
1141         break;
1142     case 13:
1143         verts[0] = swizzleLaneD(a);
1144         verts[1] = swizzleLaneF(a);
1145         verts[2] = swizzleLaneE(a);
1146         break;
1147     case 14:
1148         verts[0] = swizzleLaneE(a);
1149         verts[1] = swizzleLaneF(a);
1150         verts[2] = swizzleLane0(b);
1151         break;
1152     case 15:
1153         verts[0] = swizzleLaneF(a);
1154         verts[1] = swizzleLane1(b);
1155         verts[2] = swizzleLane0(b);
1156         break;
1157     };
1158 #else
1159     const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
1160     const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
1161 
1162     // Convert from vertical to horizontal.
1163     // Tri Pattern - provoking vertex is always v0
1164     //  v0 -> 01234567
1165     //  v1 -> 13355779
1166     //  v2 -> 22446688
1167 
1168     switch (primIndex)
1169     {
1170     case 0:
1171         verts[0] = swizzleLane0(a);
1172         verts[1] = swizzleLane1(a);
1173         verts[2] = swizzleLane2(a);
1174         break;
1175     case 1:
1176         verts[0] = swizzleLane1(a);
1177         verts[1] = swizzleLane3(a);
1178         verts[2] = swizzleLane2(a);
1179         break;
1180     case 2:
1181         verts[0] = swizzleLane2(a);
1182         verts[1] = swizzleLane3(a);
1183         verts[2] = swizzleLane4(a);
1184         break;
1185     case 3:
1186         verts[0] = swizzleLane3(a);
1187         verts[1] = swizzleLane5(a);
1188         verts[2] = swizzleLane4(a);
1189         break;
1190     case 4:
1191         verts[0] = swizzleLane4(a);
1192         verts[1] = swizzleLane5(a);
1193         verts[2] = swizzleLane6(a);
1194         break;
1195     case 5:
1196         verts[0] = swizzleLane5(a);
1197         verts[1] = swizzleLane7(a);
1198         verts[2] = swizzleLane6(a);
1199         break;
1200     case 6:
1201         verts[0] = swizzleLane6(a);
1202         verts[1] = swizzleLane7(a);
1203         verts[2] = swizzleLane0(b);
1204         break;
1205     case 7:
1206         verts[0] = swizzleLane7(a);
1207         verts[1] = swizzleLane1(b);
1208         verts[2] = swizzleLane0(b);
1209         break;
1210     };
1211 #endif
1212 }
1213 
PaTriFan0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1214 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1215 {
1216     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
1217     return false;    // Not enough vertices to assemble 8 triangles.
1218 }
1219 
PaTriFan1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1220 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1221 {
1222 #if USE_SIMD16_FRONTEND
1223     simdvector leadVert;
1224     simdvector a;
1225     simdvector b;
1226 
1227     const simd16vector &leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
1228 
1229     if (!pa.useAlternateOffset)
1230     {
1231         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
1232 
1233         for (uint32_t i = 0; i < 4; i += 1)
1234         {
1235             leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
1236 
1237             a[i] = _simd16_extract_ps(a_16[i], 0);
1238             b[i] = _simd16_extract_ps(a_16[i], 1);
1239         }
1240     }
1241     else
1242     {
1243         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
1244 
1245         for (uint32_t i = 0; i < 4; i += 1)
1246         {
1247             leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
1248 
1249             a[i] = _simd16_extract_ps(b_16[i], 0);
1250             b[i] = _simd16_extract_ps(b_16[i], 1);
1251         }
1252     }
1253 
1254 #else
1255     const simdvector &leadVert = PaGetSimdVector(pa, pa.first, slot);
1256     const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
1257     const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
1258 
1259 #endif
1260     simdscalar s;
1261 
1262     // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
1263     for(int i = 0; i < 4; ++i)
1264     {
1265         simdscalar a0 = a[i];
1266         simdscalar b0 = b[i];
1267 
1268         simdscalar comp = leadVert[i];
1269 
1270         simdvector& v0 = verts[0];
1271         v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
1272         v0[i] = _simd_permute2f128_ps(v0[i], comp, 0x00);
1273 
1274         simdvector& v2 = verts[2];
1275         s = _simd_permute2f128_ps(a0, b0, 0x21);
1276         v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
1277 
1278         simdvector& v1 = verts[1];
1279         v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
1280     }
1281 
1282     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1283     return true;
1284 }
1285 
1286 #if ENABLE_AVX512_SIMD16
PaTriFan0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1287 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1288 {
1289     SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
1290     return false;    // Not enough vertices to assemble 16 triangles.
1291 }
1292 
PaTriFan1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1293 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1294 {
1295     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
1296     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
1297     const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
1298 
1299     simd16vector &v0 = verts[0];
1300     simd16vector &v1 = verts[1];
1301     simd16vector &v2 = verts[2];
1302 
1303     //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1304     //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1305     //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1306 
1307     // for simd16 x, y, z, and w
1308     for (uint32_t i = 0; i < 4; i += 1)
1309     {
1310         simd16scalar shuff = _simd16_shuffle_ps(a[i], a[i], _MM_SHUFFLE(0, 0, 0, 0));               // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4
1311 
1312         v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00);                                        // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1313 
1314         simd16scalar temp0 = _simd16_permute2f128_ps(b[i], b[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1315         simd16scalar temp1 = _simd16_permute2f128_ps(c[i], c[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
1316 
1317         simd16scalar blend = _simd16_blend_ps(temp0, temp1, 0xF000);                                // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
1318 
1319         v2[i] = _simd16_shuffle_ps(b[i], blend, _MM_SHUFFLE(1, 0, 3, 2));                           // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1320         v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1));                           // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1321     }
1322 
1323     SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1324     return true;
1325 }
1326 
1327 #endif
PaTriFanSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1328 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1329 {
1330 #if USE_SIMD16_FRONTEND
1331     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
1332     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
1333     const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
1334 
1335     if (pa.useAlternateOffset)
1336     {
1337         primIndex += KNOB_SIMD_WIDTH;
1338     }
1339 
1340     //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1341     //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1342     //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1343 
1344     // vert 0 from leading vertex
1345     verts[0] = swizzleLane0(a);
1346 
1347     // vert 1
1348     if (primIndex < 15)
1349     {
1350         verts[1] = swizzleLaneN(b, primIndex + 1);
1351     }
1352     else
1353     {
1354         verts[1] = swizzleLane0(c);
1355     }
1356 
1357     // vert 2
1358     if (primIndex < 14)
1359     {
1360         verts[2] = swizzleLaneN(b, primIndex + 2);
1361     }
1362     else
1363     {
1364         verts[2] = swizzleLaneN(c, primIndex - 14);
1365     }
1366 #else
1367     const simdvector &a = PaGetSimdVector(pa, pa.first, slot);
1368     const simdvector &b = PaGetSimdVector(pa, pa.prev, slot);
1369     const simdvector &c = PaGetSimdVector(pa, pa.cur, slot);
1370 
1371     // vert 0 from leading vertex
1372     verts[0] = swizzleLane0(a);
1373 
1374     // vert 1
1375     if (primIndex < 7)
1376     {
1377         verts[1] = swizzleLaneN(b, primIndex + 1);
1378     }
1379     else
1380     {
1381         verts[1] = swizzleLane0(c);
1382     }
1383 
1384     // vert 2
1385     if (primIndex < 6)
1386     {
1387         verts[2] = swizzleLaneN(b, primIndex + 2);
1388     }
1389     else
1390     {
1391         verts[2] = swizzleLaneN(c, primIndex - 6);
1392     }
1393 #endif
1394 }
1395 
PaQuadList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1396 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1397 {
1398     SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
1399     return false;    // Not enough vertices to assemble 8 triangles.
1400 }
1401 
PaQuadList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1402 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1403 {
1404 #if USE_SIMD16_FRONTEND
1405     simdvector a;
1406     simdvector b;
1407 
1408     if (!pa.useAlternateOffset)
1409     {
1410         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1411 
1412         for (uint32_t i = 0; i < 4; i += 1)
1413         {
1414             a[i] = _simd16_extract_ps(a_16[i], 0);
1415             b[i] = _simd16_extract_ps(a_16[i], 1);
1416         }
1417     }
1418     else
1419     {
1420         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1421 
1422         for (uint32_t i = 0; i < 4; i += 1)
1423         {
1424             a[i] = _simd16_extract_ps(b_16[i], 0);
1425             b[i] = _simd16_extract_ps(b_16[i], 1);
1426         }
1427     }
1428 
1429 #else
1430     simdvector &a = PaGetSimdVector(pa, 0, slot);
1431     simdvector &b = PaGetSimdVector(pa, 1, slot);
1432 
1433 #endif
1434     simdscalar s1, s2;
1435 
1436     for(int i = 0; i < 4; ++i)
1437     {
1438         simdscalar a0 = a[i];
1439         simdscalar b0 = b[i];
1440 
1441         s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
1442         s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
1443 
1444         simdvector& v0 = verts[0];
1445         v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
1446 
1447         simdvector& v1 = verts[1];
1448         v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
1449 
1450         simdvector& v2 = verts[2];
1451         v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
1452     }
1453 
1454     SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1455     return true;
1456 }
1457 
1458 #if ENABLE_AVX512_SIMD16
PaQuadList0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1459 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1460 {
1461     SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
1462     return false;    // Not enough vertices to assemble 16 triangles.
1463 }
1464 
PaQuadList1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1465 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1466 {
1467     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
1468     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
1469 
1470     simd16vector &v0 = verts[0];
1471     simd16vector &v1 = verts[1];
1472     simd16vector &v2 = verts[2];
1473 
1474     //  v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b0 b0 b0 b0 bC bC
1475     //  v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
1476     //  v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1477 
1478     // for simd16 x, y, z, and w
1479     for (uint32_t i = 0; i < 4; i += 1)
1480     {
1481         simd16scalar temp0 = _simd16_permute2f128_ps(a[i], b[i], 0x88); // (2 0 2 0) = 10 00 10 00  // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
1482         simd16scalar temp1 = _simd16_permute2f128_ps(a[i], b[i], 0xDD); // (3 1 3 1) = 11 01 11 01  // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1483 
1484         v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0));                          // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
1485         v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1));                          // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE
1486         v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2));                          // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1487     }
1488 
1489     SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1490     return true;
1491 }
1492 
1493 #endif
PaQuadListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1494 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1495 {
1496 #if USE_SIMD16_FRONTEND
1497     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
1498     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
1499 
1500     if (pa.useAlternateOffset)
1501     {
1502         primIndex += KNOB_SIMD_WIDTH;
1503     }
1504 
1505     switch (primIndex)
1506     {
1507     case 0:
1508         // triangle 0 - 0 1 2
1509         verts[0] = swizzleLane0(a);
1510         verts[1] = swizzleLane1(a);
1511         verts[2] = swizzleLane2(a);
1512         break;
1513     case 1:
1514         // triangle 1 - 0 2 3
1515         verts[0] = swizzleLane0(a);
1516         verts[1] = swizzleLane2(a);
1517         verts[2] = swizzleLane3(a);
1518         break;
1519     case 2:
1520         // triangle 2 - 4 5 6
1521         verts[0] = swizzleLane4(a);
1522         verts[1] = swizzleLane5(a);
1523         verts[2] = swizzleLane6(a);
1524         break;
1525     case 3:
1526         // triangle 3 - 4 6 7
1527         verts[0] = swizzleLane4(a);
1528         verts[1] = swizzleLane6(a);
1529         verts[2] = swizzleLane7(a);
1530         break;
1531     case 4:
1532         // triangle 4 - 8 9 A
1533         verts[0] = swizzleLane8(a);
1534         verts[1] = swizzleLane9(a);
1535         verts[2] = swizzleLaneA(a);
1536         break;
1537     case 5:
1538         // triangle 5 - 8 A B
1539         verts[0] = swizzleLane8(a);
1540         verts[1] = swizzleLaneA(a);
1541         verts[2] = swizzleLaneB(a);
1542         break;
1543     case 6:
1544         // triangle 6 - C D E
1545         verts[0] = swizzleLaneC(a);
1546         verts[1] = swizzleLaneD(a);
1547         verts[2] = swizzleLaneE(a);
1548         break;
1549     case 7:
1550         // triangle 7 - C E F
1551         verts[0] = swizzleLaneC(a);
1552         verts[1] = swizzleLaneE(a);
1553         verts[2] = swizzleLaneF(a);
1554         break;
1555     case 8:
1556         // triangle 0 - 0 1 2
1557         verts[0] = swizzleLane0(b);
1558         verts[1] = swizzleLane1(b);
1559         verts[2] = swizzleLane2(b);
1560         break;
1561     case 9:
1562         // triangle 1 - 0 2 3
1563         verts[0] = swizzleLane0(b);
1564         verts[1] = swizzleLane2(b);
1565         verts[2] = swizzleLane3(b);
1566         break;
1567     case 10:
1568         // triangle 2 - 4 5 6
1569         verts[0] = swizzleLane4(b);
1570         verts[1] = swizzleLane5(b);
1571         verts[2] = swizzleLane6(b);
1572         break;
1573     case 11:
1574         // triangle 3 - 4 6 7
1575         verts[0] = swizzleLane4(b);
1576         verts[1] = swizzleLane6(b);
1577         verts[2] = swizzleLane7(b);
1578         break;
1579     case 12:
1580         // triangle 4 - 8 9 A
1581         verts[0] = swizzleLane8(b);
1582         verts[1] = swizzleLane9(b);
1583         verts[2] = swizzleLaneA(b);
1584         break;
1585     case 13:
1586         // triangle 5 - 8 A B
1587         verts[0] = swizzleLane8(b);
1588         verts[1] = swizzleLaneA(b);
1589         verts[2] = swizzleLaneB(b);
1590         break;
1591     case 14:
1592         // triangle 6 - C D E
1593         verts[0] = swizzleLaneC(b);
1594         verts[1] = swizzleLaneD(b);
1595         verts[2] = swizzleLaneE(b);
1596         break;
1597     case 15:
1598         // triangle 7 - C E F
1599         verts[0] = swizzleLaneC(b);
1600         verts[1] = swizzleLaneE(b);
1601         verts[2] = swizzleLaneF(b);
1602         break;
1603     }
1604 #else
1605     const simdvector &a = PaGetSimdVector(pa, 0, slot);
1606     const simdvector &b = PaGetSimdVector(pa, 1, slot);
1607 
1608     switch (primIndex)
1609     {
1610     case 0:
1611         // triangle 0 - 0 1 2
1612         verts[0] = swizzleLane0(a);
1613         verts[1] = swizzleLane1(a);
1614         verts[2] = swizzleLane2(a);
1615         break;
1616     case 1:
1617         // triangle 1 - 0 2 3
1618         verts[0] = swizzleLane0(a);
1619         verts[1] = swizzleLane2(a);
1620         verts[2] = swizzleLane3(a);
1621         break;
1622     case 2:
1623         // triangle 2 - 4 5 6
1624         verts[0] = swizzleLane4(a);
1625         verts[1] = swizzleLane5(a);
1626         verts[2] = swizzleLane6(a);
1627         break;
1628     case 3:
1629         // triangle 3 - 4 6 7
1630         verts[0] = swizzleLane4(a);
1631         verts[1] = swizzleLane6(a);
1632         verts[2] = swizzleLane7(a);
1633         break;
1634     case 4:
1635         // triangle 4 - 8 9 10 (0 1 2)
1636         verts[0] = swizzleLane0(b);
1637         verts[1] = swizzleLane1(b);
1638         verts[2] = swizzleLane2(b);
1639         break;
1640     case 5:
1641         // triangle 1 - 0 2 3
1642         verts[0] = swizzleLane0(b);
1643         verts[1] = swizzleLane2(b);
1644         verts[2] = swizzleLane3(b);
1645         break;
1646     case 6:
1647         // triangle 2 - 4 5 6
1648         verts[0] = swizzleLane4(b);
1649         verts[1] = swizzleLane5(b);
1650         verts[2] = swizzleLane6(b);
1651         break;
1652     case 7:
1653         // triangle 3 - 4 6 7
1654         verts[0] = swizzleLane4(b);
1655         verts[1] = swizzleLane6(b);
1656         verts[2] = swizzleLane7(b);
1657         break;
1658     }
1659 #endif
1660 }
1661 
PaLineLoop0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1662 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1663 {
1664     SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
1665     return false;
1666 }
1667 
PaLineLoop1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1668 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1669 {
1670     PaLineStrip1(pa, slot, verts);
1671 
1672     if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1)
1673     {
1674         // loop reconnect now
1675         const int lane = pa.numPrims - pa.numPrimsComplete - 1;
1676 
1677 #if USE_SIMD16_FRONTEND
1678         simdvector first;
1679 
1680         const simd16vector &first_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
1681 
1682         if (!pa.useAlternateOffset)
1683         {
1684             for (uint32_t i = 0; i < 4; i += 1)
1685             {
1686                 first[i] = _simd16_extract_ps(first_16[i], 0);
1687             }
1688         }
1689         else
1690         {
1691             for (uint32_t i = 0; i < 4; i += 1)
1692             {
1693                 first[i] = _simd16_extract_ps(first_16[i], 1);
1694             }
1695         }
1696 
1697 #else
1698         simdvector &first = PaGetSimdVector(pa, pa.first, slot);
1699 
1700 #endif
1701         for (int i = 0; i < 4; i++)
1702         {
1703             float *firstVtx = (float *)&(first[i]);
1704             float *targetVtx = (float *)&(verts[1][i]);
1705             targetVtx[lane] = firstVtx[0];
1706         }
1707     }
1708 
1709     SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1710     return true;
1711 }
1712 
1713 #if ENABLE_AVX512_SIMD16
PaLineLoop0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1714 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1715 {
1716     SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0);
1717     return false;
1718 }
1719 
PaLineLoop1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1720 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1721 {
1722     PaLineStrip1_simd16(pa, slot, verts);
1723 
1724     if (pa.numPrimsComplete + KNOB_SIMD16_WIDTH > pa.numPrims - 1)
1725     {
1726         // loop reconnect now
1727         const int lane = pa.numPrims - pa.numPrimsComplete - 1;
1728 
1729         const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot);
1730 
1731         for (int i = 0; i < 4; i++)
1732         {
1733             float *firstVtx = (float *)&(first[i]);
1734             float *targetVtx = (float *)&(verts[1][i]);
1735             targetVtx[lane] = firstVtx[0];
1736         }
1737     }
1738 
1739     SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1740     return true;
1741 }
1742 
1743 #endif
PaLineLoopSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1744 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1745 {
1746     PaLineStripSingle0(pa, slot, primIndex, verts);
1747 
1748     if (pa.numPrimsComplete + primIndex == pa.numPrims - 1)
1749     {
1750 #if USE_SIMD16_FRONTEND
1751         const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot);
1752 
1753         verts[1] = swizzleLane0(first);
1754 #else
1755         const simdvector &first = PaGetSimdVector(pa, pa.first, slot);
1756 
1757         verts[1] = swizzleLane0(first);
1758 #endif
1759     }
1760 }
1761 
PaLineList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1762 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1763 {
1764     SetNextPaState(pa, PaLineList1, PaLineListSingle0);
1765     return false;    // Not enough vertices to assemble 8 lines
1766 }
1767 
PaLineList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1768 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1769 {
1770 #if USE_SIMD16_FRONTEND
1771     simdvector a;
1772     simdvector b;
1773 
1774     if (!pa.useAlternateOffset)
1775     {
1776         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1777 
1778         for (uint32_t i = 0; i < 4; i += 1)
1779         {
1780             a[i] = _simd16_extract_ps(a_16[i], 0);
1781             b[i] = _simd16_extract_ps(a_16[i], 1);
1782         }
1783     }
1784     else
1785     {
1786         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1787 
1788         for (uint32_t i = 0; i < 4; i += 1)
1789         {
1790             a[i] = _simd16_extract_ps(b_16[i], 0);
1791             b[i] = _simd16_extract_ps(b_16[i], 1);
1792         }
1793     }
1794 
1795 #else
1796     simdvector &a = PaGetSimdVector(pa, 0, slot);
1797     simdvector &b = PaGetSimdVector(pa, 1, slot);
1798 
1799 #endif
1800     /// @todo: verify provoking vertex is correct
1801     // Line list 0  1  2  3  4  5  6  7
1802     //           8  9 10 11 12 13 14 15
1803 
1804     // shuffle:
1805     //           0 2 4 6 8 10 12 14
1806     //           1 3 5 7 9 11 13 15
1807 
1808     for (uint32_t i = 0; i < 4; ++i)
1809     {
1810         // 0 1 2 3 8 9 10 11
1811         __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
1812         // 4 5 6 7 12 13 14 15
1813         __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
1814 
1815         // 0 2 4 6 8 10 12 14
1816         verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
1817         // 1 3 5 7 9 11 13 15
1818         verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
1819     }
1820 
1821     SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1822     return true;
1823 }
1824 
1825 #if ENABLE_AVX512_SIMD16
PaLineList0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1826 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1827 {
1828     SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0);
1829     return false;    // Not enough vertices to assemble 16 lines
1830 }
1831 
PaLineList1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1832 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1833 {
1834     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
1835     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
1836 
1837     simd16vector &v0 = verts[0];
1838     simd16vector &v1 = verts[1];
1839 
1840     // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
1841     // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b4 b7 b9 bB bD bF
1842 
1843     // for simd16 x, y, z, and w
1844     for (int i = 0; i < 4; i += 1)
1845     {
1846         simd16scalar temp0 = _simd16_permute2f128_ps(a[i], b[i], 0x88); // (2 0 2 0) 10 00 10 00    // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b9 b9 bA bB
1847         simd16scalar temp1 = _simd16_permute2f128_ps(a[i], b[i], 0xDD); // (3 1 3 1) 11 01 11 01    // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1848 
1849         v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));                          // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
1850         v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));                          // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
1851     }
1852 
1853     SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1854     return true;
1855 }
1856 
1857 #endif
PaLineListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1858 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1859 {
1860 #if USE_SIMD16_FRONTEND
1861     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
1862     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
1863 
1864     if (pa.useAlternateOffset)
1865     {
1866         primIndex += KNOB_SIMD_WIDTH;
1867     }
1868 
1869     switch (primIndex)
1870     {
1871     case 0:
1872         verts[0] = swizzleLane0(a);
1873         verts[1] = swizzleLane1(a);
1874         break;
1875     case 1:
1876         verts[0] = swizzleLane2(a);
1877         verts[1] = swizzleLane3(a);
1878         break;
1879     case 2:
1880         verts[0] = swizzleLane4(a);
1881         verts[1] = swizzleLane5(a);
1882         break;
1883     case 3:
1884         verts[0] = swizzleLane6(a);
1885         verts[1] = swizzleLane7(a);
1886         break;
1887     case 4:
1888         verts[0] = swizzleLane8(a);
1889         verts[1] = swizzleLane9(a);
1890         break;
1891     case 5:
1892         verts[0] = swizzleLaneA(a);
1893         verts[1] = swizzleLaneB(a);
1894         break;
1895     case 6:
1896         verts[0] = swizzleLaneC(a);
1897         verts[1] = swizzleLaneD(a);
1898         break;
1899     case 7:
1900         verts[0] = swizzleLaneE(a);
1901         verts[1] = swizzleLaneF(a);
1902         break;
1903     case 8:
1904         verts[0] = swizzleLane0(b);
1905         verts[1] = swizzleLane1(b);
1906         break;
1907     case 9:
1908         verts[0] = swizzleLane2(b);
1909         verts[1] = swizzleLane3(b);
1910         break;
1911     case 10:
1912         verts[0] = swizzleLane4(b);
1913         verts[1] = swizzleLane5(b);
1914         break;
1915     case 11:
1916         verts[0] = swizzleLane6(b);
1917         verts[1] = swizzleLane7(b);
1918         break;
1919     case 12:
1920         verts[0] = swizzleLane8(b);
1921         verts[1] = swizzleLane9(b);
1922         break;
1923     case 13:
1924         verts[0] = swizzleLaneA(b);
1925         verts[1] = swizzleLaneB(b);
1926         break;
1927     case 14:
1928         verts[0] = swizzleLaneC(b);
1929         verts[1] = swizzleLaneD(b);
1930         break;
1931     case 15:
1932         verts[0] = swizzleLaneE(b);
1933         verts[1] = swizzleLaneF(b);
1934         break;
1935     }
1936 #else
1937     const simdvector &a = PaGetSimdVector(pa, 0, slot);
1938     const simdvector &b = PaGetSimdVector(pa, 1, slot);
1939 
1940     switch (primIndex)
1941     {
1942     case 0:
1943         verts[0] = swizzleLane0(a);
1944         verts[1] = swizzleLane1(a);
1945         break;
1946     case 1:
1947         verts[0] = swizzleLane2(a);
1948         verts[1] = swizzleLane3(a);
1949         break;
1950     case 2:
1951         verts[0] = swizzleLane4(a);
1952         verts[1] = swizzleLane5(a);
1953         break;
1954     case 3:
1955         verts[0] = swizzleLane6(a);
1956         verts[1] = swizzleLane7(a);
1957         break;
1958     case 4:
1959         verts[0] = swizzleLane0(b);
1960         verts[1] = swizzleLane1(b);
1961         break;
1962     case 5:
1963         verts[0] = swizzleLane2(b);
1964         verts[1] = swizzleLane3(b);
1965         break;
1966     case 6:
1967         verts[0] = swizzleLane4(b);
1968         verts[1] = swizzleLane5(b);
1969         break;
1970     case 7:
1971         verts[0] = swizzleLane6(b);
1972         verts[1] = swizzleLane7(b);
1973         break;
1974     }
1975 #endif
1976 }
1977 
PaLineStrip0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1978 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1979 {
1980     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
1981     return false;    // Not enough vertices to assemble 8 lines
1982 }
1983 
PaLineStrip1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1984 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1985 {
1986 #if USE_SIMD16_FRONTEND
1987     simdvector a;
1988     simdvector b;
1989 
1990     if (!pa.useAlternateOffset)
1991     {
1992         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
1993 
1994         for (uint32_t i = 0; i < 4; i += 1)
1995         {
1996             a[i] = _simd16_extract_ps(a_16[i], 0);
1997             b[i] = _simd16_extract_ps(a_16[i], 1);
1998         }
1999     }
2000     else
2001     {
2002         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
2003 
2004         for (uint32_t i = 0; i < 4; i += 1)
2005         {
2006             a[i] = _simd16_extract_ps(b_16[i], 0);
2007             b[i] = _simd16_extract_ps(b_16[i], 1);
2008         }
2009     }
2010 
2011 #else
2012     simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
2013     simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
2014 
2015 #endif
2016     /// @todo: verify provoking vertex is correct
2017     // Line list 0  1  2  3  4  5  6  7
2018     //           8  9 10 11 12 13 14 15
2019 
2020     // shuffle:
2021     //           0  1  2  3  4  5  6  7
2022     //           1  2  3  4  5  6  7  8
2023 
2024     verts[0] = a;
2025 
2026     for(uint32_t i = 0; i < 4; ++i)
2027     {
2028         // 1 2 3 x 5 6 7 x
2029         __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
2030         // 4 5 6 7 8 9 10 11
2031         __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
2032 
2033         // x x x 4 x x x 8
2034         __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low  (0 0 0 0)
2035 
2036         verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
2037     }
2038 
2039     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
2040     return true;
2041 }
2042 
2043 #if ENABLE_AVX512_SIMD16
PaLineStrip0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2044 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2045 {
2046     SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0);
2047     return false;    // Not enough vertices to assemble 16 lines
2048 }
2049 
PaLineStrip1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2050 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2051 {
2052     const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
2053 
2054     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
2055     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
2056 
2057     simd16vector &v0 = verts[0];
2058     simd16vector &v1 = verts[1];
2059 
2060     // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2061     // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2062 
2063     v0 = a;                                                                                         // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2064 
2065     // for simd16 x, y, z, and w
2066     for (int i = 0; i < 4; i += 1)
2067     {
2068         simd16scalar temp = _simd16_blend_ps(a[i], b[i], 0x0001);                                   // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2069 
2070         v1[i] = _simd16_permute_ps(temp, perm);                                                     // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2071     }
2072 
2073     SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
2074     return true;
2075 }
2076 
2077 #endif
PaLineStripSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])2078 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2079 {
2080 #if USE_SIMD16_FRONTEND
2081     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
2082     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
2083 
2084     if (pa.useAlternateOffset)
2085     {
2086         primIndex += KNOB_SIMD_WIDTH;
2087     }
2088 
2089     switch (primIndex)
2090     {
2091     case 0:
2092         verts[0] = swizzleLane0(a);
2093         verts[1] = swizzleLane1(a);
2094         break;
2095     case 1:
2096         verts[0] = swizzleLane1(a);
2097         verts[1] = swizzleLane2(a);
2098         break;
2099     case 2:
2100         verts[0] = swizzleLane2(a);
2101         verts[1] = swizzleLane3(a);
2102         break;
2103     case 3:
2104         verts[0] = swizzleLane3(a);
2105         verts[1] = swizzleLane4(a);
2106         break;
2107     case 4:
2108         verts[0] = swizzleLane4(a);
2109         verts[1] = swizzleLane5(a);
2110         break;
2111     case 5:
2112         verts[0] = swizzleLane5(a);
2113         verts[1] = swizzleLane6(a);
2114         break;
2115     case 6:
2116         verts[0] = swizzleLane6(a);
2117         verts[1] = swizzleLane7(a);
2118         break;
2119     case 7:
2120         verts[0] = swizzleLane7(a);
2121         verts[1] = swizzleLane8(a);
2122         break;
2123     case 8:
2124         verts[0] = swizzleLane8(a);
2125         verts[1] = swizzleLane9(a);
2126         break;
2127     case 9:
2128         verts[0] = swizzleLane9(a);
2129         verts[1] = swizzleLaneA(a);
2130         break;
2131     case 10:
2132         verts[0] = swizzleLaneA(a);
2133         verts[1] = swizzleLaneB(a);
2134         break;
2135     case 11:
2136         verts[0] = swizzleLaneB(a);
2137         verts[1] = swizzleLaneC(a);
2138         break;
2139     case 12:
2140         verts[0] = swizzleLaneC(a);
2141         verts[1] = swizzleLaneD(a);
2142         break;
2143     case 13:
2144         verts[0] = swizzleLaneD(a);
2145         verts[1] = swizzleLaneE(a);
2146         break;
2147     case 14:
2148         verts[0] = swizzleLaneE(a);
2149         verts[1] = swizzleLaneF(a);
2150         break;
2151     case 15:
2152         verts[0] = swizzleLaneF(a);
2153         verts[1] = swizzleLane0(b);
2154         break;
2155     }
2156 #else
2157     const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
2158     const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
2159 
2160     switch (primIndex)
2161     {
2162     case 0:
2163         verts[0] = swizzleLane0(a);
2164         verts[1] = swizzleLane1(a);
2165         break;
2166     case 1:
2167         verts[0] = swizzleLane1(a);
2168         verts[1] = swizzleLane2(a);
2169         break;
2170     case 2:
2171         verts[0] = swizzleLane2(a);
2172         verts[1] = swizzleLane3(a);
2173         break;
2174     case 3:
2175         verts[0] = swizzleLane3(a);
2176         verts[1] = swizzleLane4(a);
2177         break;
2178     case 4:
2179         verts[0] = swizzleLane4(a);
2180         verts[1] = swizzleLane5(a);
2181         break;
2182     case 5:
2183         verts[0] = swizzleLane5(a);
2184         verts[1] = swizzleLane6(a);
2185         break;
2186     case 6:
2187         verts[0] = swizzleLane6(a);
2188         verts[1] = swizzleLane7(a);
2189         break;
2190     case 7:
2191         verts[0] = swizzleLane7(a);
2192         verts[1] = swizzleLane0(b);
2193         break;
2194     }
2195 #endif
2196 }
2197 
PaPoints0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2198 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2199 {
2200 #if USE_SIMD16_FRONTEND
2201     simdvector a;
2202 
2203     const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2204 
2205     if (!pa.useAlternateOffset)
2206     {
2207         for (uint32_t i = 0; i < 4; i += 1)
2208         {
2209             a[i] = _simd16_extract_ps(a_16[i], 0);
2210         }
2211     }
2212     else
2213     {
2214         for (uint32_t i = 0; i < 4; i += 1)
2215         {
2216             a[i] = _simd16_extract_ps(a_16[i], 1);
2217         }
2218     }
2219 
2220 #else
2221     simdvector &a = PaGetSimdVector(pa, 0, slot);
2222 
2223 #endif
2224     verts[0] = a;  // points only have 1 vertex.
2225 
2226     SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2227     return true;
2228 }
2229 
2230 #if ENABLE_AVX512_SIMD16
PaPoints0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2231 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2232 {
2233     simd16vector &a = PaGetSimdVector_simd16(pa, pa.cur, slot);
2234 
2235     verts[0] = a;  // points only have 1 vertex.
2236 
2237     SetNextPaState_simd16(pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2238     return true;
2239 }
2240 
2241 #endif
PaPointsSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])2242 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2243 {
2244 #if USE_SIMD16_FRONTEND
2245     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
2246 
2247     if (pa.useAlternateOffset)
2248     {
2249         primIndex += KNOB_SIMD_WIDTH;
2250     }
2251 
2252     verts[0] = swizzleLaneN(a, primIndex);
2253 #else
2254     const simdvector &a = PaGetSimdVector(pa, 0, slot);
2255 
2256     verts[0] = swizzleLaneN(a, primIndex);
2257 #endif
2258 }
2259 
2260 //////////////////////////////////////////////////////////////////////////
2261 /// @brief State 1 for RECT_LIST topology.
2262 ///        There is not enough to assemble 8 triangles.
PaRectList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2263 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2264 {
2265     SetNextPaState(pa, PaRectList1, PaRectListSingle0);
2266     return false;
2267 }
2268 
2269 //////////////////////////////////////////////////////////////////////////
2270 /// @brief State 1 for RECT_LIST topology.
2271 ///   Rect lists has the following format.
2272 ///             w          x          y           z
2273 ///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
2274 ///         | \ |      | \ |      | \ |       | \ |
2275 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
2276 ///            v0         v3         v6          v9
2277 ///
2278 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2279 ///
2280 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2281 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2282 ///   etc.
2283 ///
2284 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2285 ///   where v0 contains all the first vertices for 8 triangles.
2286 ///
2287 ///     Result:
2288 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2289 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2290 ///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
2291 ///
2292 /// @param pa - State for PA state machine.
2293 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2294 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2295 bool PaRectList1(
2296     PA_STATE_OPT& pa,
2297     uint32_t slot,
2298     simdvector verts[])
2299 {
2300     // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
2301 #if USE_SIMD16_FRONTEND
2302     simdvector a;
2303     simdvector b;
2304 
2305     if (!pa.useAlternateOffset)
2306     {
2307         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2308 
2309         for (uint32_t i = 0; i < 4; i += 1)
2310         {
2311             a[i] = _simd16_extract_ps(a_16[i], 0);
2312             b[i] = _simd16_extract_ps(a_16[i], 1);
2313         }
2314     }
2315     else
2316     {
2317         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
2318 
2319         for (uint32_t i = 0; i < 4; i += 1)
2320         {
2321             a[i] = _simd16_extract_ps(b_16[i], 0);
2322             b[i] = _simd16_extract_ps(b_16[i], 1);;
2323         }
2324     }
2325 
2326 #else
2327     simdvector &a = PaGetSimdVector(pa, 0, slot);           // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7 }
2328     simdvector &b = PaGetSimdVector(pa, 1, slot);           // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
2329 
2330 #endif
2331     __m256 tmp0, tmp1, tmp2;
2332 
2333     // Loop over each component in the simdvector.
2334     for(int i = 0; i < 4; ++i)
2335     {
2336         simdvector& v0 = verts[0];                          // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2337         tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01);    // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2338         v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20);          //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,  * } where * is don't care.
2339         tmp1  = _mm256_permute_ps(v0[i], 0xF0);             // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *,  * }
2340         v0[i] = _mm256_permute_ps(v0[i], 0x5A);             //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
2341         v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0);         //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
2342 
2343         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2344         ///      AVX2 should make this much cheaper.
2345         simdvector& v1 = verts[1];                          // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2346         v1[i] = _mm256_permute_ps(a[i], 0x09);              //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
2347         tmp1  = _mm256_permute_ps(a[i], 0x43);              // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
2348         tmp2  = _mm256_blend_ps(v1[i], tmp1, 0xF0);         // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
2349         tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);    // tmp1 = { v7,  *, v4,  v5, *  *,  *,  * }
2350         v1[i] = _mm256_permute_ps(tmp0, 0xE0);              //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
2351         v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0);         //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
2352         v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C);         //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2353 
2354         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
2355         simdvector& v2 = verts[2];                          // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
2356         v2[i] = _mm256_permute_ps(tmp0, 0x30);              //   v2 = { *, *, *, *, v8, *, v11, * }
2357         tmp1  = _mm256_permute_ps(tmp2, 0x31);              // tmp1 = { v2, *, v5, *, *, *, *, * }
2358         v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
2359 
2360         // Need to compute 4th implied vertex for the rectangle.
2361         tmp2  = _mm256_sub_ps(v0[i], v1[i]);
2362         tmp2  = _mm256_add_ps(tmp2, v2[i]);                 // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
2363         tmp2  = _mm256_permute_ps(tmp2, 0xA0);              // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
2364         v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA);         //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
2365     }
2366 
2367     SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2368     return true;
2369 }
2370 
2371 //////////////////////////////////////////////////////////////////////////
2372 /// @brief State 2 for RECT_LIST topology.
2373 ///        Not implemented unless there is a use case for more then 8 rects.
2374 /// @param pa - State for PA state machine.
2375 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2376 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectList2(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2377 bool PaRectList2(
2378     PA_STATE_OPT& pa,
2379     uint32_t slot,
2380     simdvector verts[])
2381 {
2382     SWR_INVALID("Is rect list used for anything other then clears?");
2383     SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2384     return true;
2385 }
2386 
2387 #if ENABLE_AVX512_SIMD16
2388 //////////////////////////////////////////////////////////////////////////
2389 /// @brief State 1 for RECT_LIST topology.
2390 ///        There is not enough to assemble 8 triangles.
PaRectList0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2391 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2392 {
2393     SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0);
2394     return false;
2395 }
2396 
2397 //////////////////////////////////////////////////////////////////////////
2398 /// @brief State 1 for RECT_LIST topology.
2399 ///   Rect lists has the following format.
2400 ///             w          x          y           z
2401 ///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
2402 ///         | \ |      | \ |      | \ |       | \ |
2403 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
2404 ///            v0         v3         v6          v9
2405 ///
2406 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2407 ///
2408 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2409 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2410 ///   etc.
2411 ///
2412 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2413 ///   where v0 contains all the first vertices for 8 triangles.
2414 ///
2415 ///     Result:
2416 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2417 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2418 ///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
2419 ///
2420 /// @param pa - State for PA state machine.
2421 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2422 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectList1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2423 bool PaRectList1_simd16(
2424     PA_STATE_OPT& pa,
2425     uint32_t slot,
2426     simd16vector verts[])
2427 {
2428     simdvector a;
2429     simdvector b;
2430 
2431     if (!pa.useAlternateOffset)
2432     {
2433         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7, v8, v9, v10, v11, v12, v13, v14, v15 }
2434 
2435         for (uint32_t i = 0; i < 4; i += 1)
2436         {
2437             a[i] = _simd16_extract_ps(a_16[i], 0);
2438             b[i] = _simd16_extract_ps(a_16[i], 1);
2439         }
2440     }
2441     else
2442     {
2443         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. }
2444 
2445         for (uint32_t i = 0; i < 4; i += 1)
2446         {
2447             a[i] = _simd16_extract_ps(b_16[i], 0);
2448             b[i] = _simd16_extract_ps(b_16[i], 1);
2449         }
2450     }
2451 
2452     simd16vector &v0 = verts[0];                            // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2453     simd16vector &v1 = verts[1];                            // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2454     simd16vector &v2 = verts[2];                            // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
2455 
2456     // Loop over each component in the simdvector.
2457     for (int i = 0; i < 4; i += 1)
2458     {
2459         simdscalar v0_lo;                                   // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2460         simdscalar v1_lo;                                   // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2461         simdscalar v2_lo;                                   // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
2462 
2463         __m256 tmp0, tmp1, tmp2;
2464 
2465         tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01);    // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2466         v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20);          //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,  * } where * is don't care.
2467         tmp1 = _mm256_permute_ps(v0_lo, 0xF0);              // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *,  * }
2468         v0_lo = _mm256_permute_ps(v0_lo, 0x5A);             //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
2469         v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0);         //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
2470 
2471         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2472         ///      AVX2 should make this much cheaper.
2473         v1_lo = _mm256_permute_ps(a[i], 0x09);              //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
2474         tmp1 = _mm256_permute_ps(a[i], 0x43);               // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
2475         tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0);          // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
2476         tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);     // tmp1 = { v7,  *, v4,  v5, *  *,  *,  * }
2477         v1_lo = _mm256_permute_ps(tmp0, 0xE0);              //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
2478         v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0);         //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
2479         v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C);         //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2480 
2481         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
2482         v2_lo = _mm256_permute_ps(tmp0, 0x30);              //   v2 = { *, *, *, *, v8, *, v11, * }
2483         tmp1 = _mm256_permute_ps(tmp2, 0x31);               // tmp1 = { v2, *, v5, *, *, *, *, * }
2484         v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0);
2485 
2486         // Need to compute 4th implied vertex for the rectangle.
2487         tmp2 = _mm256_sub_ps(v0_lo, v1_lo);
2488         tmp2 = _mm256_add_ps(tmp2, v2_lo);                  // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
2489         tmp2 = _mm256_permute_ps(tmp2, 0xA0);               // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
2490         v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA);         //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
2491 
2492         v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0);
2493         v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0);
2494         v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
2495     }
2496 
2497     SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2498     return true;
2499 }
2500 
2501 //////////////////////////////////////////////////////////////////////////
2502 /// @brief State 2 for RECT_LIST topology.
2503 ///        Not implemented unless there is a use case for more then 8 rects.
2504 /// @param pa - State for PA state machine.
2505 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2506 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectList2_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2507 bool PaRectList2_simd16(
2508     PA_STATE_OPT& pa,
2509     uint32_t slot,
2510     simd16vector verts[])
2511 {
2512     SWR_INVALID("Is rect list used for anything other then clears?");
2513     SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2514     return true;
2515 }
2516 
2517 #endif
2518 //////////////////////////////////////////////////////////////////////////
2519 /// @brief This procedure is called by the Binner to assemble the attributes.
2520 ///        Unlike position, which is stored vertically, the attributes are
2521 ///        stored horizontally. The outputs from the VS, labeled as 'a' and
2522 ///        'b' are vertical. This function needs to transpose the lanes
2523 ///        containing the vertical attribute data into horizontal form.
2524 /// @param pa - State for PA state machine.
2525 /// @param slot - Index into VS output for a given attribute.
2526 /// @param primIndex - Binner processes each triangle individually.
2527 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])2528 void PaRectListSingle0(
2529     PA_STATE_OPT& pa,
2530     uint32_t slot,
2531     uint32_t primIndex,
2532     simd4scalar verts[])
2533 {
2534     // We have 12 simdscalars contained within 3 simdvectors which
2535     // hold at least 8 triangles worth of data. We want to assemble a single
2536     // triangle with data in horizontal form.
2537 #if USE_SIMD16_FRONTEND
2538     simdvector a;
2539     simdvector b;
2540 
2541     if (!pa.useAlternateOffset)
2542     {
2543         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2544 
2545         for (uint32_t i = 0; i < 4; i += 1)
2546         {
2547             a[i] = _simd16_extract_ps(a_16[i], 0);
2548             b[i] = _simd16_extract_ps(a_16[i], 1);
2549         }
2550     }
2551     else
2552     {
2553         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
2554 
2555         for (uint32_t i = 0; i < 4; i += 1)
2556         {
2557             a[i] = _simd16_extract_ps(b_16[i], 0);
2558             b[i] = _simd16_extract_ps(b_16[i], 1);;
2559         }
2560     }
2561 
2562 #else
2563     simdvector& a = PaGetSimdVector(pa, 0, slot);
2564 
2565 #endif
2566     // Convert from vertical to horizontal.
2567     switch(primIndex)
2568     {
2569     case 0:
2570         verts[0] = swizzleLane0(a);
2571         verts[1] = swizzleLane1(a);
2572         verts[2] = swizzleLane2(a);
2573         break;
2574     case 1:
2575         verts[0] = swizzleLane0(a);
2576         verts[1] = swizzleLane2(a);
2577         verts[2] = _mm_blend_ps(verts[0], verts[1], 0xA);
2578         break;
2579     case 2:
2580     case 3:
2581     case 4:
2582     case 5:
2583     case 6:
2584     case 7:
2585         SWR_INVALID("Invalid primIndex: %d", primIndex);
2586         break;
2587     };
2588 }
2589 
PA_STATE_OPT(DRAW_CONTEXT * in_pDC,uint32_t in_numPrims,uint8_t * pStream,uint32_t in_streamSizeInVerts,uint32_t in_vertexStride,bool in_isStreaming,uint32_t numVertsPerPrim,PRIMITIVE_TOPOLOGY topo)2590 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts,
2591     uint32_t in_vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo) :
2592     PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0),
2593     cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
2594 {
2595     const API_STATE& state = GetApiState(pDC);
2596 
2597     this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
2598 
2599 #if ENABLE_AVX512_SIMD16
2600     pfnPaFunc_simd16 = nullptr;
2601 
2602 #endif
2603     switch (this->binTopology)
2604     {
2605         case TOP_TRIANGLE_LIST:
2606             this->pfnPaFunc = PaTriList0;
2607 #if ENABLE_AVX512_SIMD16
2608             this->pfnPaFunc_simd16 = PaTriList0_simd16;
2609 #endif
2610             break;
2611         case TOP_TRIANGLE_STRIP:
2612             this->pfnPaFunc = PaTriStrip0;
2613 #if ENABLE_AVX512_SIMD16
2614             this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
2615 #endif
2616             break;
2617         case TOP_TRIANGLE_FAN:
2618             this->pfnPaFunc = PaTriFan0;
2619 #if ENABLE_AVX512_SIMD16
2620             this->pfnPaFunc_simd16 = PaTriFan0_simd16;
2621 #endif
2622             break;
2623         case TOP_QUAD_LIST:
2624             this->pfnPaFunc = PaQuadList0;
2625 #if ENABLE_AVX512_SIMD16
2626             this->pfnPaFunc_simd16 = PaQuadList0_simd16;
2627 #endif
2628             this->numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
2629             break;
2630         case TOP_QUAD_STRIP:
2631             // quad strip pattern when decomposed into triangles is the same as verts strips
2632             this->pfnPaFunc = PaTriStrip0;
2633 #if ENABLE_AVX512_SIMD16
2634             this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
2635 #endif
2636             this->numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
2637             break;
2638         case TOP_LINE_LIST:
2639             this->pfnPaFunc = PaLineList0;
2640 #if ENABLE_AVX512_SIMD16
2641             this->pfnPaFunc_simd16 = PaLineList0_simd16;
2642 #endif
2643             this->numPrims = in_numPrims;
2644             break;
2645         case TOP_LINE_STRIP:
2646             this->pfnPaFunc = PaLineStrip0;
2647 #if ENABLE_AVX512_SIMD16
2648             this->pfnPaFunc_simd16 = PaLineStrip0_simd16;
2649 #endif
2650             this->numPrims = in_numPrims;
2651             break;
2652         case TOP_LINE_LOOP:
2653             this->pfnPaFunc = PaLineLoop0;
2654 #if ENABLE_AVX512_SIMD16
2655             this->pfnPaFunc_simd16 = PaLineLoop0_simd16;
2656 #endif
2657             this->numPrims = in_numPrims;
2658             break;
2659         case TOP_POINT_LIST:
2660             this->pfnPaFunc = PaPoints0;
2661 #if ENABLE_AVX512_SIMD16
2662             this->pfnPaFunc_simd16 = PaPoints0_simd16;
2663 #endif
2664             this->numPrims = in_numPrims;
2665             break;
2666         case TOP_RECT_LIST:
2667             this->pfnPaFunc = PaRectList0;
2668 #if ENABLE_AVX512_SIMD16
2669             this->pfnPaFunc_simd16 = PaRectList0_simd16;
2670 #endif
2671             this->numPrims = in_numPrims * 2;
2672             break;
2673 
2674         case TOP_PATCHLIST_1:
2675             this->pfnPaFunc = PaPatchList<1>;
2676 #if ENABLE_AVX512_SIMD16
2677             this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
2678 #endif
2679             break;
2680         case TOP_PATCHLIST_2:
2681             this->pfnPaFunc = PaPatchList<2>;
2682 #if ENABLE_AVX512_SIMD16
2683             this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
2684 #endif
2685             break;
2686         case TOP_PATCHLIST_3:
2687             this->pfnPaFunc = PaPatchList<3>;
2688 #if ENABLE_AVX512_SIMD16
2689             this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
2690 #endif
2691             break;
2692         case TOP_PATCHLIST_4:
2693             this->pfnPaFunc = PaPatchList<4>;
2694 #if ENABLE_AVX512_SIMD16
2695             this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
2696 #endif
2697             break;
2698         case TOP_PATCHLIST_5:
2699             this->pfnPaFunc = PaPatchList<5>;
2700 #if ENABLE_AVX512_SIMD16
2701             this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
2702 #endif
2703             break;
2704         case TOP_PATCHLIST_6:
2705             this->pfnPaFunc = PaPatchList<6>;
2706 #if ENABLE_AVX512_SIMD16
2707             this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
2708 #endif
2709             break;
2710         case TOP_PATCHLIST_7:
2711             this->pfnPaFunc = PaPatchList<7>;
2712 #if ENABLE_AVX512_SIMD16
2713             this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
2714 #endif
2715             break;
2716         case TOP_PATCHLIST_8:
2717             this->pfnPaFunc = PaPatchList<8>;
2718 #if ENABLE_AVX512_SIMD16
2719             this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
2720 #endif
2721             break;
2722         case TOP_PATCHLIST_9:
2723             this->pfnPaFunc = PaPatchList<9>;
2724 #if ENABLE_AVX512_SIMD16
2725             this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
2726 #endif
2727             break;
2728         case TOP_PATCHLIST_10:
2729             this->pfnPaFunc = PaPatchList<10>;
2730 #if ENABLE_AVX512_SIMD16
2731             this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
2732 #endif
2733             break;
2734         case TOP_PATCHLIST_11:
2735             this->pfnPaFunc = PaPatchList<11>;
2736 #if ENABLE_AVX512_SIMD16
2737             this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
2738 #endif
2739             break;
2740         case TOP_PATCHLIST_12:
2741             this->pfnPaFunc = PaPatchList<12>;
2742 #if ENABLE_AVX512_SIMD16
2743             this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
2744 #endif
2745             break;
2746         case TOP_PATCHLIST_13:
2747             this->pfnPaFunc = PaPatchList<13>;
2748 #if ENABLE_AVX512_SIMD16
2749             this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
2750 #endif
2751             break;
2752         case TOP_PATCHLIST_14:
2753             this->pfnPaFunc = PaPatchList<14>;
2754 #if ENABLE_AVX512_SIMD16
2755             this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
2756 #endif
2757             break;
2758         case TOP_PATCHLIST_15:
2759             this->pfnPaFunc = PaPatchList<15>;
2760 #if ENABLE_AVX512_SIMD16
2761             this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
2762 #endif
2763             break;
2764         case TOP_PATCHLIST_16:
2765             this->pfnPaFunc = PaPatchList<16>;
2766 #if ENABLE_AVX512_SIMD16
2767             this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
2768 #endif
2769             break;
2770         case TOP_PATCHLIST_17:
2771             this->pfnPaFunc = PaPatchList<17>;
2772 #if ENABLE_AVX512_SIMD16
2773             this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
2774 #endif
2775             break;
2776         case TOP_PATCHLIST_18:
2777             this->pfnPaFunc = PaPatchList<18>;
2778 #if ENABLE_AVX512_SIMD16
2779             this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
2780 #endif
2781             break;
2782         case TOP_PATCHLIST_19:
2783             this->pfnPaFunc = PaPatchList<19>;
2784 #if ENABLE_AVX512_SIMD16
2785             this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
2786 #endif
2787             break;
2788         case TOP_PATCHLIST_20:
2789             this->pfnPaFunc = PaPatchList<20>;
2790 #if ENABLE_AVX512_SIMD16
2791             this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
2792 #endif
2793             break;
2794         case TOP_PATCHLIST_21:
2795             this->pfnPaFunc = PaPatchList<21>;
2796 #if ENABLE_AVX512_SIMD16
2797             this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
2798 #endif
2799             break;
2800         case TOP_PATCHLIST_22:
2801             this->pfnPaFunc = PaPatchList<22>;
2802 #if ENABLE_AVX512_SIMD16
2803             this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
2804 #endif
2805             break;
2806         case TOP_PATCHLIST_23:
2807             this->pfnPaFunc = PaPatchList<23>;
2808 #if ENABLE_AVX512_SIMD16
2809             this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
2810 #endif
2811             break;
2812         case TOP_PATCHLIST_24:
2813             this->pfnPaFunc = PaPatchList<24>;
2814 #if ENABLE_AVX512_SIMD16
2815             this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
2816 #endif
2817             break;
2818         case TOP_PATCHLIST_25:
2819             this->pfnPaFunc = PaPatchList<25>;
2820 #if ENABLE_AVX512_SIMD16
2821             this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
2822 #endif
2823             break;
2824         case TOP_PATCHLIST_26:
2825             this->pfnPaFunc = PaPatchList<26>;
2826 #if ENABLE_AVX512_SIMD16
2827             this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
2828 #endif
2829             break;
2830         case TOP_PATCHLIST_27:
2831             this->pfnPaFunc = PaPatchList<27>;
2832 #if ENABLE_AVX512_SIMD16
2833             this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
2834 #endif
2835             break;
2836         case TOP_PATCHLIST_28:
2837             this->pfnPaFunc = PaPatchList<28>;
2838 #if ENABLE_AVX512_SIMD16
2839             this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
2840 #endif
2841             break;
2842         case TOP_PATCHLIST_29:
2843             this->pfnPaFunc = PaPatchList<29>;
2844 #if ENABLE_AVX512_SIMD16
2845             this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
2846 #endif
2847             break;
2848         case TOP_PATCHLIST_30:
2849             this->pfnPaFunc = PaPatchList<30>;
2850 #if ENABLE_AVX512_SIMD16
2851             this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
2852 #endif
2853             break;
2854         case TOP_PATCHLIST_31:
2855             this->pfnPaFunc = PaPatchList<31>;
2856 #if ENABLE_AVX512_SIMD16
2857             this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
2858 #endif
2859             break;
2860         case TOP_PATCHLIST_32:
2861             this->pfnPaFunc = PaPatchList<32>;
2862 #if ENABLE_AVX512_SIMD16
2863             this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
2864 #endif
2865             break;
2866 
2867         default:
2868             SWR_INVALID("Invalid topology: %d", this->binTopology);
2869             break;
2870     };
2871 
2872     this->pfnPaFuncReset = this->pfnPaFunc;
2873 #if ENABLE_AVX512_SIMD16
2874     this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
2875 #endif
2876 
2877 #if USE_SIMD16_FRONTEND
2878     simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
2879     simd16scalari id82 = _simd16_set_epi32( 7,  7,  6,  6,  5,  5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
2880 
2881 #else
2882     simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2883     simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
2884 
2885 #endif
2886     switch(this->binTopology)
2887     {
2888         case TOP_TRIANGLE_LIST:
2889         case TOP_TRIANGLE_STRIP:
2890         case TOP_TRIANGLE_FAN:
2891         case TOP_LINE_STRIP:
2892         case TOP_LINE_LIST:
2893         case TOP_LINE_LOOP:
2894 #if USE_SIMD16_FRONTEND
2895             this->primIDIncr = 16;
2896             this->primID = id16;
2897 #else
2898             this->primIDIncr = 8;
2899             this->primID = id8;
2900 #endif
2901             break;
2902         case TOP_QUAD_LIST:
2903         case TOP_QUAD_STRIP:
2904         case TOP_RECT_LIST:
2905 #if USE_SIMD16_FRONTEND
2906             this->primIDIncr = 8;
2907             this->primID = id82;
2908 #else
2909             this->primIDIncr = 4;
2910             this->primID = id4;
2911 #endif
2912             break;
2913         case TOP_POINT_LIST:
2914 #if USE_SIMD16_FRONTEND
2915             this->primIDIncr = 16;
2916             this->primID = id16;
2917 #else
2918             this->primIDIncr = 8;
2919             this->primID = id8;
2920 #endif
2921             break;
2922         case TOP_PATCHLIST_1:
2923         case TOP_PATCHLIST_2:
2924         case TOP_PATCHLIST_3:
2925         case TOP_PATCHLIST_4:
2926         case TOP_PATCHLIST_5:
2927         case TOP_PATCHLIST_6:
2928         case TOP_PATCHLIST_7:
2929         case TOP_PATCHLIST_8:
2930         case TOP_PATCHLIST_9:
2931         case TOP_PATCHLIST_10:
2932         case TOP_PATCHLIST_11:
2933         case TOP_PATCHLIST_12:
2934         case TOP_PATCHLIST_13:
2935         case TOP_PATCHLIST_14:
2936         case TOP_PATCHLIST_15:
2937         case TOP_PATCHLIST_16:
2938         case TOP_PATCHLIST_17:
2939         case TOP_PATCHLIST_18:
2940         case TOP_PATCHLIST_19:
2941         case TOP_PATCHLIST_20:
2942         case TOP_PATCHLIST_21:
2943         case TOP_PATCHLIST_22:
2944         case TOP_PATCHLIST_23:
2945         case TOP_PATCHLIST_24:
2946         case TOP_PATCHLIST_25:
2947         case TOP_PATCHLIST_26:
2948         case TOP_PATCHLIST_27:
2949         case TOP_PATCHLIST_28:
2950         case TOP_PATCHLIST_29:
2951         case TOP_PATCHLIST_30:
2952         case TOP_PATCHLIST_31:
2953         case TOP_PATCHLIST_32:
2954             // Always run KNOB_SIMD_WIDTH number of patches at a time.
2955 #if USE_SIMD16_FRONTEND
2956             this->primIDIncr = 16;
2957             this->primID = id16;
2958 #else
2959             this->primIDIncr = 8;
2960             this->primID = id8;
2961 #endif
2962             break;
2963 
2964         default:
2965             SWR_INVALID("Invalid topology: %d", this->binTopology);
2966             break;
2967     };
2968 
2969 }
2970 #endif
2971