1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa_avx.cpp
24 *
25 * @brief AVX implementation for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27  *        A state machine specific to a given topology drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #include "context.h"
32 #include "pa.h"
33 #include "frontend.h"
34
35 #if (KNOB_SIMD_WIDTH == 8)
36
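// Helpers that transpose a single SIMD lane of vertex data from the vertical
// SoA registers (x, y, z, w) into one horizontal vec4 {x[lane], y[lane], z[lane], w[lane]}.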
37 INLINE simd4scalar swizzleLane0(const simdscalar& x,
38 const simdscalar& y,
39 const simdscalar& z,
40 const simdscalar& w)
41 {
42 simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
43 simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
44 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
45 }
46
47 INLINE simd4scalar swizzleLane1(const simdscalar& x,
48 const simdscalar& y,
49 const simdscalar& z,
50 const simdscalar& w)
51 {
52 simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
53 simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
54 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
55 }
56
57 INLINE simd4scalar swizzleLane2(const simdscalar& x,
58 const simdscalar& y,
59 const simdscalar& z,
60 const simdscalar& w)
61 {
62 simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
63 simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
64 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
65 }
66
67 INLINE simd4scalar swizzleLane3(const simdscalar& x,
68 const simdscalar& y,
69 const simdscalar& z,
70 const simdscalar& w)
71 {
72 simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
73 simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
74 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
75 }
76
77 INLINE simd4scalar swizzleLane4(const simdscalar& x,
78 const simdscalar& y,
79 const simdscalar& z,
80 const simdscalar& w)
81 {
82 simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
83 simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
84 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
85 }
86
87 INLINE simd4scalar swizzleLane5(const simdscalar& x,
88 const simdscalar& y,
89 const simdscalar& z,
90 const simdscalar& w)
91 {
92 simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
93 simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
94 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
95 }
96
97 INLINE simd4scalar swizzleLane6(const simdscalar& x,
98 const simdscalar& y,
99 const simdscalar& z,
100 const simdscalar& w)
101 {
102 simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
103 simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
104 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
105 }
106
107 INLINE simd4scalar swizzleLane7(const simdscalar& x,
108 const simdscalar& y,
109 const simdscalar& z,
110 const simdscalar& w)
111 {
112 simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
113 simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
114 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
115 }
116
117 INLINE simd4scalar swizzleLane0(const simdvector& v)
118 {
119 return swizzleLane0(v.x, v.y, v.z, v.w);
120 }
121
122 INLINE simd4scalar swizzleLane1(const simdvector& v)
123 {
124 return swizzleLane1(v.x, v.y, v.z, v.w);
125 }
126
127 INLINE simd4scalar swizzleLane2(const simdvector& v)
128 {
129 return swizzleLane2(v.x, v.y, v.z, v.w);
130 }
131
132 INLINE simd4scalar swizzleLane3(const simdvector& v)
133 {
134 return swizzleLane3(v.x, v.y, v.z, v.w);
135 }
136
137 INLINE simd4scalar swizzleLane4(const simdvector& v)
138 {
139 return swizzleLane4(v.x, v.y, v.z, v.w);
140 }
141
142 INLINE simd4scalar swizzleLane5(const simdvector& v)
143 {
144 return swizzleLane5(v.x, v.y, v.z, v.w);
145 }
146
147 INLINE simd4scalar swizzleLane6(const simdvector& v)
148 {
149 return swizzleLane6(v.x, v.y, v.z, v.w);
150 }
151
152 INLINE simd4scalar swizzleLane7(const simdvector& v)
153 {
154 return swizzleLane7(v.x, v.y, v.z, v.w);
155 }
156
157 INLINE simd4scalar swizzleLaneN(const simdvector& v, int lane)
158 {
159 switch (lane)
160 {
161 case 0:
162 return swizzleLane0(v);
163 case 1:
164 return swizzleLane1(v);
165 case 2:
166 return swizzleLane2(v);
167 case 3:
168 return swizzleLane3(v);
169 case 4:
170 return swizzleLane4(v);
171 case 5:
172 return swizzleLane5(v);
173 case 6:
174 return swizzleLane6(v);
175 case 7:
176 return swizzleLane7(v);
177 default:
178 return _mm_setzero_ps();
179 }
180 }
181
182 #if ENABLE_AVX512_SIMD16
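// simd16 variants: lanes 0-7 are swizzled out of the lower 256-bit half of each
// component register, lanes 8-F out of the upper half.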
183 INLINE simd4scalar swizzleLane0(const simd16vector& v)
184 {
185 return swizzleLane0(_simd16_extract_ps(v.x, 0),
186 _simd16_extract_ps(v.y, 0),
187 _simd16_extract_ps(v.z, 0),
188 _simd16_extract_ps(v.w, 0));
189 }
190
191 INLINE simd4scalar swizzleLane1(const simd16vector& v)
192 {
193 return swizzleLane1(_simd16_extract_ps(v.x, 0),
194 _simd16_extract_ps(v.y, 0),
195 _simd16_extract_ps(v.z, 0),
196 _simd16_extract_ps(v.w, 0));
197 }
198
199 INLINE simd4scalar swizzleLane2(const simd16vector& v)
200 {
201 return swizzleLane2(_simd16_extract_ps(v.x, 0),
202 _simd16_extract_ps(v.y, 0),
203 _simd16_extract_ps(v.z, 0),
204 _simd16_extract_ps(v.w, 0));
205 }
206
207 INLINE simd4scalar swizzleLane3(const simd16vector& v)
208 {
209 return swizzleLane3(_simd16_extract_ps(v.x, 0),
210 _simd16_extract_ps(v.y, 0),
211 _simd16_extract_ps(v.z, 0),
212 _simd16_extract_ps(v.w, 0));
213 }
214
215 INLINE simd4scalar swizzleLane4(const simd16vector& v)
216 {
217 return swizzleLane4(_simd16_extract_ps(v.x, 0),
218 _simd16_extract_ps(v.y, 0),
219 _simd16_extract_ps(v.z, 0),
220 _simd16_extract_ps(v.w, 0));
221 }
222
223 INLINE simd4scalar swizzleLane5(const simd16vector& v)
224 {
225 return swizzleLane5(_simd16_extract_ps(v.x, 0),
226 _simd16_extract_ps(v.y, 0),
227 _simd16_extract_ps(v.z, 0),
228 _simd16_extract_ps(v.w, 0));
229 }
230
231 INLINE simd4scalar swizzleLane6(const simd16vector& v)
232 {
233 return swizzleLane6(_simd16_extract_ps(v.x, 0),
234 _simd16_extract_ps(v.y, 0),
235 _simd16_extract_ps(v.z, 0),
236 _simd16_extract_ps(v.w, 0));
237 }
238
239 INLINE simd4scalar swizzleLane7(const simd16vector& v)
240 {
241 return swizzleLane7(_simd16_extract_ps(v.x, 0),
242 _simd16_extract_ps(v.y, 0),
243 _simd16_extract_ps(v.z, 0),
244 _simd16_extract_ps(v.w, 0));
245 }
246
247 INLINE simd4scalar swizzleLane8(const simd16vector& v)
248 {
249 return swizzleLane0(_simd16_extract_ps(v.x, 1),
250 _simd16_extract_ps(v.y, 1),
251 _simd16_extract_ps(v.z, 1),
252 _simd16_extract_ps(v.w, 1));
253 }
254
255 INLINE simd4scalar swizzleLane9(const simd16vector& v)
256 {
257 return swizzleLane1(_simd16_extract_ps(v.x, 1),
258 _simd16_extract_ps(v.y, 1),
259 _simd16_extract_ps(v.z, 1),
260 _simd16_extract_ps(v.w, 1));
261 }
262
263 INLINE simd4scalar swizzleLaneA(const simd16vector& v)
264 {
265 return swizzleLane2(_simd16_extract_ps(v.x, 1),
266 _simd16_extract_ps(v.y, 1),
267 _simd16_extract_ps(v.z, 1),
268 _simd16_extract_ps(v.w, 1));
269 }
270
271 INLINE simd4scalar swizzleLaneB(const simd16vector& v)
272 {
273 return swizzleLane3(_simd16_extract_ps(v.x, 1),
274 _simd16_extract_ps(v.y, 1),
275 _simd16_extract_ps(v.z, 1),
276 _simd16_extract_ps(v.w, 1));
277 }
278
279 INLINE simd4scalar swizzleLaneC(const simd16vector& v)
280 {
281 return swizzleLane4(_simd16_extract_ps(v.x, 1),
282 _simd16_extract_ps(v.y, 1),
283 _simd16_extract_ps(v.z, 1),
284 _simd16_extract_ps(v.w, 1));
285 }
286
287 INLINE simd4scalar swizzleLaneD(const simd16vector& v)
288 {
289 return swizzleLane5(_simd16_extract_ps(v.x, 1),
290 _simd16_extract_ps(v.y, 1),
291 _simd16_extract_ps(v.z, 1),
292 _simd16_extract_ps(v.w, 1));
293 }
294
295 INLINE simd4scalar swizzleLaneE(const simd16vector& v)
296 {
297 return swizzleLane6(_simd16_extract_ps(v.x, 1),
298 _simd16_extract_ps(v.y, 1),
299 _simd16_extract_ps(v.z, 1),
300 _simd16_extract_ps(v.w, 1));
301 }
302
303 INLINE simd4scalar swizzleLaneF(const simd16vector& v)
304 {
305 return swizzleLane7(_simd16_extract_ps(v.x, 1),
306 _simd16_extract_ps(v.y, 1),
307 _simd16_extract_ps(v.z, 1),
308 _simd16_extract_ps(v.w, 1));
309 }
310
311 INLINE simd4scalar swizzleLaneN(const simd16vector& v, int lane)
312 {
313 switch (lane)
314 {
315 case 0:
316 return swizzleLane0(v);
317 case 1:
318 return swizzleLane1(v);
319 case 2:
320 return swizzleLane2(v);
321 case 3:
322 return swizzleLane3(v);
323 case 4:
324 return swizzleLane4(v);
325 case 5:
326 return swizzleLane5(v);
327 case 6:
328 return swizzleLane6(v);
329 case 7:
330 return swizzleLane7(v);
331 case 8:
332 return swizzleLane8(v);
333 case 9:
334 return swizzleLane9(v);
335 case 10:
336 return swizzleLaneA(v);
337 case 11:
338 return swizzleLaneB(v);
339 case 12:
340 return swizzleLaneC(v);
341 case 13:
342 return swizzleLaneD(v);
343 case 14:
344 return swizzleLaneE(v);
345 case 15:
346 return swizzleLaneF(v);
347 default:
348 return _mm_setzero_ps();
349 }
350 }
351
352 #endif
353 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
354 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
355 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
356 #if ENABLE_AVX512_SIMD16
357 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
358 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
359 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
360 #endif
361 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
362
363 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
364 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
365 #if ENABLE_AVX512_SIMD16
366 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
367 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
368 #endif
369 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
370
371 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
372 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
373 #if ENABLE_AVX512_SIMD16
374 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
375 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
376 #endif
377 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
378
379 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
380 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
381 #if ENABLE_AVX512_SIMD16
382 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
383 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
384 #endif
385 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
386
387 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
388 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
389 #if ENABLE_AVX512_SIMD16
390 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
391 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
392 #endif
393 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
394
395 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
396 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
397 #if ENABLE_AVX512_SIMD16
398 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
399 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
400 #endif
401 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
402
403 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
404 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
405 #if ENABLE_AVX512_SIMD16
406 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
407 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
408 #endif
409 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
410
411 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
412 #if ENABLE_AVX512_SIMD16
413 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
414 #endif
415 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
416
417 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
418 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
419 bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
420 #if ENABLE_AVX512_SIMD16
421 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
422 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
423 bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
424 #endif
425 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
426
427 template <uint32_t TotalControlPoints>
428 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
429 {
430 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
431 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
432 // Each attribute has 4 components.
433
434 /// @todo Optimize this
435
436 #if USE_SIMD16_FRONTEND
437 if (pa.useAlternateOffset)
438 {
439 primIndex += KNOB_SIMD_WIDTH;
440 }
441
442 #endif
443 float* pOutVec = (float*)verts;
444
445 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
446 {
447 uint32_t input_cp = primIndex * TotalControlPoints + cp;
448 #if USE_SIMD16_FRONTEND
449 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
450 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
451
452 #else
453 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
454 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
455
456 #endif
457 // Loop over all components of the attribute
458 for (uint32_t i = 0; i < 4; ++i)
459 {
460 #if USE_SIMD16_FRONTEND
461 const float* pInputVec =
462 (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
463 #else
464 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
465 #endif
466 pOutVec[cp * 4 + i] = pInputVec[input_lane];
467 }
468 }
469 }
470
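// PaPatchList<Total, Current> just advances the state machine, accumulating one more
// simdvector of control points per call; the PaPatchList<N, N> specializations further
// down terminate the recursion via PaPatchListTerm, which actually assembles the patch.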
471 template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
472 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
473 {
474 SetNextPaState(pa,
475 PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
476 PaPatchListSingle<TotalControlPoints>);
477
478 return false;
479 }
480
481 template <uint32_t TotalControlPoints>
482 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
483 {
484 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
485 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
486 // Each attribute has 4 components.
487
488 /// @todo Optimize this
489
490 #if USE_SIMD16_FRONTEND
491 uint32_t lane_offset = 0;
492
493 if (pa.useAlternateOffset)
494 {
495 lane_offset = KNOB_SIMD_WIDTH;
496 }
497
498 #endif
499 // Loop over all components of the attribute
500 for (uint32_t i = 0; i < 4; ++i)
501 {
502 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
503 {
504 float vec[KNOB_SIMD_WIDTH];
505 for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
506 {
507 #if USE_SIMD16_FRONTEND
508 uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp;
509 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
510 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
511
512 const float* pInputVec =
513 (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
514 #else
515 uint32_t input_cp = lane * TotalControlPoints + cp;
516 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
517 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
518
519 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
520 #endif
521 vec[lane] = pInputVec[input_lane];
522 }
523 verts[cp][i] = _simd_loadu_ps(vec);
524 }
525 }
526
527 SetNextPaState(pa,
528 PaPatchList<TotalControlPoints>,
529 PaPatchListSingle<TotalControlPoints>,
530 0,
531 PA_STATE_OPT::SIMD_WIDTH,
532 true);
533
534 return true;
535 }
536
537 #if ENABLE_AVX512_SIMD16
538 template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
539 static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
540 {
541 SetNextPaState_simd16(pa,
542 PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
543 PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
544 PaPatchListSingle<TotalControlPoints>);
545
546 return false;
547 }
548
549 template <uint32_t TotalControlPoints>
550 static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
551 {
552     // We have an input of KNOB_SIMD16_WIDTH * TotalControlPoints and we output
553 // KNOB_SIMD16_WIDTH * 1 patch. This function is called once per attribute.
554 // Each attribute has 4 components.
555
556 /// @todo Optimize this
557
558 // Loop over all components of the attribute
559 for (uint32_t i = 0; i < 4; ++i)
560 {
561 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
562 {
563 float vec[KNOB_SIMD16_WIDTH];
564 for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane)
565 {
566 uint32_t input_cp = lane * TotalControlPoints + cp;
567 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
568 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
569
570 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
571 vec[lane] = pInputVec[input_lane];
572 }
573 verts[cp][i] = _simd16_loadu_ps(vec);
574 }
575 }
576
577 SetNextPaState_simd16(pa,
578 PaPatchList_simd16<TotalControlPoints>,
579 PaPatchList<TotalControlPoints>,
580 PaPatchListSingle<TotalControlPoints>,
581 0,
582 PA_STATE_OPT::SIMD_WIDTH,
583 true);
584
585 return true;
586 }
587
588 #endif
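// Explicit specializations that end the PaPatchList template recursion once all N
// control points of a patch have been gathered.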
589 #define PA_PATCH_LIST_TERMINATOR(N) \
590 template <> \
591 bool PaPatchList<N, N>(PA_STATE_OPT & pa, uint32_t slot, simdvector verts[]) \
592 { \
593 return PaPatchListTerm<N>(pa, slot, verts); \
594 }
595 PA_PATCH_LIST_TERMINATOR(1)
596 PA_PATCH_LIST_TERMINATOR(2)
597 PA_PATCH_LIST_TERMINATOR(3)
598 PA_PATCH_LIST_TERMINATOR(4)
599 PA_PATCH_LIST_TERMINATOR(5)
600 PA_PATCH_LIST_TERMINATOR(6)
601 PA_PATCH_LIST_TERMINATOR(7)
602 PA_PATCH_LIST_TERMINATOR(8)
603 PA_PATCH_LIST_TERMINATOR(9)
604 PA_PATCH_LIST_TERMINATOR(10)
605 PA_PATCH_LIST_TERMINATOR(11)
606 PA_PATCH_LIST_TERMINATOR(12)
607 PA_PATCH_LIST_TERMINATOR(13)
608 PA_PATCH_LIST_TERMINATOR(14)
609 PA_PATCH_LIST_TERMINATOR(15)
610 PA_PATCH_LIST_TERMINATOR(16)
611 PA_PATCH_LIST_TERMINATOR(17)
612 PA_PATCH_LIST_TERMINATOR(18)
613 PA_PATCH_LIST_TERMINATOR(19)
614 PA_PATCH_LIST_TERMINATOR(20)
615 PA_PATCH_LIST_TERMINATOR(21)
616 PA_PATCH_LIST_TERMINATOR(22)
617 PA_PATCH_LIST_TERMINATOR(23)
618 PA_PATCH_LIST_TERMINATOR(24)
619 PA_PATCH_LIST_TERMINATOR(25)
620 PA_PATCH_LIST_TERMINATOR(26)
621 PA_PATCH_LIST_TERMINATOR(27)
622 PA_PATCH_LIST_TERMINATOR(28)
623 PA_PATCH_LIST_TERMINATOR(29)
624 PA_PATCH_LIST_TERMINATOR(30)
625 PA_PATCH_LIST_TERMINATOR(31)
626 PA_PATCH_LIST_TERMINATOR(32)
627 #undef PA_PATCH_LIST_TERMINATOR
628
629 #if ENABLE_AVX512_SIMD16
630 #define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \
631 template <> \
632 bool PaPatchList_simd16<N, N>(PA_STATE_OPT & pa, uint32_t slot, simd16vector verts[]) \
633 { \
634 return PaPatchListTerm_simd16<N>(pa, slot, verts); \
635 }
636 PA_PATCH_LIST_TERMINATOR_SIMD16(1)
637 PA_PATCH_LIST_TERMINATOR_SIMD16(2)
638 PA_PATCH_LIST_TERMINATOR_SIMD16(3)
639 PA_PATCH_LIST_TERMINATOR_SIMD16(4)
640 PA_PATCH_LIST_TERMINATOR_SIMD16(5)
641 PA_PATCH_LIST_TERMINATOR_SIMD16(6)
642 PA_PATCH_LIST_TERMINATOR_SIMD16(7)
643 PA_PATCH_LIST_TERMINATOR_SIMD16(8)
644 PA_PATCH_LIST_TERMINATOR_SIMD16(9)
645 PA_PATCH_LIST_TERMINATOR_SIMD16(10)
646 PA_PATCH_LIST_TERMINATOR_SIMD16(11)
647 PA_PATCH_LIST_TERMINATOR_SIMD16(12)
648 PA_PATCH_LIST_TERMINATOR_SIMD16(13)
649 PA_PATCH_LIST_TERMINATOR_SIMD16(14)
650 PA_PATCH_LIST_TERMINATOR_SIMD16(15)
651 PA_PATCH_LIST_TERMINATOR_SIMD16(16)
652 PA_PATCH_LIST_TERMINATOR_SIMD16(17)
653 PA_PATCH_LIST_TERMINATOR_SIMD16(18)
654 PA_PATCH_LIST_TERMINATOR_SIMD16(19)
655 PA_PATCH_LIST_TERMINATOR_SIMD16(20)
656 PA_PATCH_LIST_TERMINATOR_SIMD16(21)
657 PA_PATCH_LIST_TERMINATOR_SIMD16(22)
658 PA_PATCH_LIST_TERMINATOR_SIMD16(23)
659 PA_PATCH_LIST_TERMINATOR_SIMD16(24)
660 PA_PATCH_LIST_TERMINATOR_SIMD16(25)
661 PA_PATCH_LIST_TERMINATOR_SIMD16(26)
662 PA_PATCH_LIST_TERMINATOR_SIMD16(27)
663 PA_PATCH_LIST_TERMINATOR_SIMD16(28)
664 PA_PATCH_LIST_TERMINATOR_SIMD16(29)
665 PA_PATCH_LIST_TERMINATOR_SIMD16(30)
666 PA_PATCH_LIST_TERMINATOR_SIMD16(31)
667 PA_PATCH_LIST_TERMINATOR_SIMD16(32)
668 #undef PA_PATCH_LIST_TERMINATOR_SIMD16
669
670 #endif
671 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
672 {
673 SetNextPaState(pa, PaTriList1, PaTriListSingle0);
674 return false; // Not enough vertices to assemble 4 or 8 triangles.
675 }
676
677 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
678 {
679 SetNextPaState(pa, PaTriList2, PaTriListSingle0);
680 return false; // Not enough vertices to assemble 8 triangles.
681 }
682
683 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
684 {
685 #if KNOB_ARCH == KNOB_ARCH_AVX
686 #if USE_SIMD16_FRONTEND
687 simdvector a;
688 simdvector b;
689 simdvector c;
690
691 if (!pa.useAlternateOffset)
692 {
693 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
694 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
695
696 for (uint32_t i = 0; i < 4; i += 1)
697 {
698 a[i] = _simd16_extract_ps(a_16[i], 0);
699 b[i] = _simd16_extract_ps(a_16[i], 1);
700 c[i] = _simd16_extract_ps(b_16[i], 0);
701 }
702 }
703 else
704 {
705 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
706 const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot);
707
708 for (uint32_t i = 0; i < 4; i += 1)
709 {
710 a[i] = _simd16_extract_ps(b_16[i], 1);
711 b[i] = _simd16_extract_ps(c_16[i], 0);
712 c[i] = _simd16_extract_ps(c_16[i], 1);
713 }
714 }
715
716 #else
717 simdvector& a = PaGetSimdVector(pa, 0, slot);
718 simdvector& b = PaGetSimdVector(pa, 1, slot);
719 simdvector& c = PaGetSimdVector(pa, 2, slot);
720
721 #endif
722 simdscalar s;
723
724 // Tri Pattern - provoking vertex is always v0
725 // v0 -> 0 3 6 9 12 15 18 21
726 // v1 -> 1 4 7 10 13 16 19 22
727 // v2 -> 2 5 8 11 14 17 20 23
728
729 for (int i = 0; i < 4; ++i)
730 {
731 simdvector& v0 = verts[0];
732 v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
733 v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
734 v0[i] = _simd_permute_ps_i(v0[i], 0x6C);
735 s = _simd_permute2f128_ps(v0[i], v0[i], 0x21);
736 v0[i] = _simd_blend_ps(v0[i], s, 0x44);
737
738 simdvector& v1 = verts[1];
739 v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
740 v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
741 v1[i] = _simd_permute_ps_i(v1[i], 0xB1);
742 s = _simd_permute2f128_ps(v1[i], v1[i], 0x21);
743 v1[i] = _simd_blend_ps(v1[i], s, 0x66);
744
745 simdvector& v2 = verts[2];
746 v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
747 v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
748 v2[i] = _simd_permute_ps_i(v2[i], 0xC6);
749 s = _simd_permute2f128_ps(v2[i], v2[i], 0x21);
750 v2[i] = _simd_blend_ps(v2[i], s, 0x22);
751 }
752
753 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
754 const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
755 const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
756 const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
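    // The blends below select, per element, which of a/b/c owns each output vertex;
    // the perms then compact every third element into consecutive lanes, giving the
    // v0/v1/v2 layouts shown further down.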
757
758 #if USE_SIMD16_FRONTEND
759 simdvector a;
760 simdvector b;
761 simdvector c;
762
763 if (!pa.useAlternateOffset)
764 {
765 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
766 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
767
768 for (uint32_t i = 0; i < 4; i += 1)
769 {
770 a[i] = _simd16_extract_ps(a_16[i], 0);
771 b[i] = _simd16_extract_ps(a_16[i], 1);
772 c[i] = _simd16_extract_ps(b_16[i], 0);
773 }
774 }
775 else
776 {
777 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
778 const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot);
779
780 for (uint32_t i = 0; i < 4; i += 1)
781 {
782 a[i] = _simd16_extract_ps(b_16[i], 1);
783 b[i] = _simd16_extract_ps(c_16[i], 0);
784 c[i] = _simd16_extract_ps(c_16[i], 1);
785 }
786 }
787
788 #else
789 const simdvector& a = PaGetSimdVector(pa, 0, slot);
790 const simdvector& b = PaGetSimdVector(pa, 1, slot);
791 const simdvector& c = PaGetSimdVector(pa, 2, slot);
792
793 #endif
794 // v0 -> a0 a3 a6 b1 b4 b7 c2 c5
795 // v1 -> a1 a4 a7 b2 b5 c0 c3 c6
796 // v2 -> a2 a5 b0 b3 b6 c1 c4 c7
797
798 simdvector& v0 = verts[0];
799 simdvector& v1 = verts[1];
800 simdvector& v2 = verts[2];
801
802 // for simd x, y, z, and w
803 for (int i = 0; i < 4; ++i)
804 {
805 simdscalar temp0 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
806 simdscalar temp1 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
807 simdscalar temp2 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
808
809 v0[i] = _simd_permute_ps(temp0, perm0);
810 v1[i] = _simd_permute_ps(temp1, perm1);
811 v2[i] = _simd_permute_ps(temp2, perm2);
812 }
813
814 #endif
815 SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
816 return true;
817 }
818
819 #if ENABLE_AVX512_SIMD16
820 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
821 {
822 SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
823 return false; // Not enough vertices to assemble 16 triangles
824 }
825
826 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
827 {
828 SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
829 return false; // Not enough vertices to assemble 16 triangles
830 }
831
832 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
833 {
834 // clang-format off
835
836 #if KNOB_ARCH >= KNOB_ARCH_AVX2
837 const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0);
838 const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1);
839 const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2);
840 #else // KNOB_ARCH == KNOB_ARCH_AVX
841 simd16scalar perm0 = _simd16_setzero_ps();
842 simd16scalar perm1 = _simd16_setzero_ps();
843 simd16scalar perm2 = _simd16_setzero_ps();
844 #endif
845
846 const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
847 const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
848 const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot);
849
850 const simd16mask mask0 = 0x4924;
851 const simd16mask mask1 = 0x2492;
852 const simd16mask mask2 = 0x9249;
853
854 // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
855 // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
856 // v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
857
858 simd16vector& v0 = verts[0];
859 simd16vector& v1 = verts[1];
860 simd16vector& v2 = verts[2];
861
862 // for simd16 x, y, z, and w
863 for (int i = 0; i < 4; i += 1)
864 {
865 simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
866 simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
867 simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i]));
868
869 simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask0), tempc, mask1);
870 simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask2), tempc, mask0);
871 simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask1), tempc, mask2);
872
873 #if KNOB_ARCH >= KNOB_ARCH_AVX2
874 v0[i] = _simd16_permute_ps(temp0, perm0);
875 v1[i] = _simd16_permute_ps(temp1, perm1);
876 v2[i] = _simd16_permute_ps(temp2, perm2);
877 #else // #if KNOB_ARCH == KNOB_ARCH_AVX
878
879         // the general permutes (above) are prohibitively slow to emulate on AVX (it's scalar code)
880
881 temp0 = _simd16_permute_ps_i(temp0, 0x6C); // (0, 3, 2, 1) => 00 11 01 10 => 0x6C
882 perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
883 temp0 = _simd16_blend_ps(temp0, perm0, 0x4444); // 0010 0010 0010 0010
884 perm0 = _simd16_permute2f128_ps(temp0, temp0, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
885 v0[i] = _simd16_blend_ps(temp0, perm0, 0x3838); // 0001 1100 0001 1100
886
887 temp1 = _simd16_permute_ps_i(temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
888 perm1 = _simd16_permute2f128_ps(temp1, temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
889             temp1 = _simd16_blend_ps(temp1, perm1, 0x6666);                   // 0110 0110 0110 0110
890 perm1 = _simd16_permute2f128_ps(temp1, temp1, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
891 v1[i] = _simd16_blend_ps(temp1, perm1, 0x1818); // 0001 1000 0001 1000
892
893 temp2 = _simd16_permute_ps_i(temp2, 0xC6); // (2, 1, 0, 3) => 01 10 00 11 => 0xC6
894 perm2 = _simd16_permute2f128_ps(temp2, temp2, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
895 temp2 = _simd16_blend_ps(temp2, perm2, 0x2222); // 0100 0100 0100 0100
896 perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
897 v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C); // 0011 1000 0011 1000
898 #endif
899 }
900
901 SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
902 return true;
903
904 // clang-format on
905 }
906
907 #endif
908 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
909 {
910 #if USE_SIMD16_FRONTEND
911 const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
912 const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
913 const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot);
914
915 if (pa.useAlternateOffset)
916 {
917 primIndex += KNOB_SIMD_WIDTH;
918 }
919
920 // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
921 // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
922 // v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
923
924 switch (primIndex)
925 {
926 case 0:
927 verts[0] = swizzleLane0(a);
928 verts[1] = swizzleLane1(a);
929 verts[2] = swizzleLane2(a);
930 break;
931 case 1:
932 verts[0] = swizzleLane3(a);
933 verts[1] = swizzleLane4(a);
934 verts[2] = swizzleLane5(a);
935 break;
936 case 2:
937 verts[0] = swizzleLane6(a);
938 verts[1] = swizzleLane7(a);
939 verts[2] = swizzleLane8(a);
940 break;
941 case 3:
942 verts[0] = swizzleLane9(a);
943 verts[1] = swizzleLaneA(a);
944 verts[2] = swizzleLaneB(a);
945 break;
946 case 4:
947 verts[0] = swizzleLaneC(a);
948 verts[1] = swizzleLaneD(a);
949 verts[2] = swizzleLaneE(a);
950 break;
951 case 5:
952 verts[0] = swizzleLaneF(a);
953 verts[1] = swizzleLane0(b);
954 verts[2] = swizzleLane1(b);
955 break;
956 case 6:
957 verts[0] = swizzleLane2(b);
958 verts[1] = swizzleLane3(b);
959 verts[2] = swizzleLane4(b);
960 break;
961 case 7:
962 verts[0] = swizzleLane5(b);
963 verts[1] = swizzleLane6(b);
964 verts[2] = swizzleLane7(b);
965 break;
966 case 8:
967 verts[0] = swizzleLane8(b);
968 verts[1] = swizzleLane9(b);
969 verts[2] = swizzleLaneA(b);
970 break;
971 case 9:
972 verts[0] = swizzleLaneB(b);
973 verts[1] = swizzleLaneC(b);
974 verts[2] = swizzleLaneD(b);
975 break;
976 case 10:
977 verts[0] = swizzleLaneE(b);
978 verts[1] = swizzleLaneF(b);
979 verts[2] = swizzleLane0(c);
980 break;
981 case 11:
982 verts[0] = swizzleLane1(c);
983 verts[1] = swizzleLane2(c);
984 verts[2] = swizzleLane3(c);
985 break;
986 case 12:
987 verts[0] = swizzleLane4(c);
988 verts[1] = swizzleLane5(c);
989 verts[2] = swizzleLane6(c);
990 break;
991 case 13:
992 verts[0] = swizzleLane7(c);
993 verts[1] = swizzleLane8(c);
994 verts[2] = swizzleLane9(c);
995 break;
996 case 14:
997 verts[0] = swizzleLaneA(c);
998 verts[1] = swizzleLaneB(c);
999 verts[2] = swizzleLaneC(c);
1000 break;
1001 case 15:
1002 verts[0] = swizzleLaneD(c);
1003 verts[1] = swizzleLaneE(c);
1004 verts[2] = swizzleLaneF(c);
1005 break;
1006 };
1007 #else
1008 // We have 12 simdscalars contained within 3 simdvectors which
1009 // hold at least 8 triangles worth of data. We want to assemble a single
1010 // triangle with data in horizontal form.
1011
1012 const simdvector& a = PaGetSimdVector(pa, 0, slot);
1013 const simdvector& b = PaGetSimdVector(pa, 1, slot);
1014 const simdvector& c = PaGetSimdVector(pa, 2, slot);
1015
1016 // Convert from vertical to horizontal.
1017 // Tri Pattern - provoking vertex is always v0
1018 // v0 -> 0 3 6 9 12 15 18 21
1019 // v1 -> 1 4 7 10 13 16 19 22
1020 // v2 -> 2 5 8 11 14 17 20 23
1021
1022 switch (primIndex)
1023 {
1024 case 0:
1025 verts[0] = swizzleLane0(a);
1026 verts[1] = swizzleLane1(a);
1027 verts[2] = swizzleLane2(a);
1028 break;
1029 case 1:
1030 verts[0] = swizzleLane3(a);
1031 verts[1] = swizzleLane4(a);
1032 verts[2] = swizzleLane5(a);
1033 break;
1034 case 2:
1035 verts[0] = swizzleLane6(a);
1036 verts[1] = swizzleLane7(a);
1037 verts[2] = swizzleLane0(b);
1038 break;
1039 case 3:
1040 verts[0] = swizzleLane1(b);
1041 verts[1] = swizzleLane2(b);
1042 verts[2] = swizzleLane3(b);
1043 break;
1044 case 4:
1045 verts[0] = swizzleLane4(b);
1046 verts[1] = swizzleLane5(b);
1047 verts[2] = swizzleLane6(b);
1048 break;
1049 case 5:
1050 verts[0] = swizzleLane7(b);
1051 verts[1] = swizzleLane0(c);
1052 verts[2] = swizzleLane1(c);
1053 break;
1054 case 6:
1055 verts[0] = swizzleLane2(c);
1056 verts[1] = swizzleLane3(c);
1057 verts[2] = swizzleLane4(c);
1058 break;
1059 case 7:
1060 verts[0] = swizzleLane5(c);
1061 verts[1] = swizzleLane6(c);
1062 verts[2] = swizzleLane7(c);
1063 break;
1064 };
1065 #endif
1066 }
1067
1068 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1069 {
1070 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
1071 return false; // Not enough vertices to assemble 8 triangles.
1072 }
1073
1074 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1075 {
1076 #if USE_SIMD16_FRONTEND
1077 simdvector a;
1078 simdvector b;
1079
1080 if (!pa.useAlternateOffset)
1081 {
1082 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
1083
1084 for (uint32_t i = 0; i < 4; i += 1)
1085 {
1086 a[i] = _simd16_extract_ps(a_16[i], 0);
1087 b[i] = _simd16_extract_ps(a_16[i], 1);
1088 }
1089 }
1090 else
1091 {
1092 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
1093
1094 for (uint32_t i = 0; i < 4; i += 1)
1095 {
1096 a[i] = _simd16_extract_ps(b_16[i], 0);
1097 b[i] = _simd16_extract_ps(b_16[i], 1);
1098 }
1099 }
1100
1101 #else
1102 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
1103 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
1104
1105 #endif
1106 simdscalar s;
1107
1108 for (int i = 0; i < 4; ++i)
1109 {
1110 simdscalar a0 = a[i];
1111 simdscalar b0 = b[i];
1112
1113 // Tri Pattern - provoking vertex is always v0
1114 // v0 -> 01234567
1115 // v1 -> 13355779
1116 // v2 -> 22446688
1117 simdvector& v0 = verts[0];
1118 v0[i] = a0;
1119
1120 // s -> 4567891011
1121 s = _simd_permute2f128_ps(a0, b0, 0x21);
1122 // s -> 23456789
1123 s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
1124
1125 simdvector& v1 = verts[1];
1126 // v1 -> 13355779
1127 v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
1128
1129 simdvector& v2 = verts[2];
1130 // v2 -> 22446688
1131 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
1132 }
1133
1134 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1135 return true;
1136 }
1137
1138 #if ENABLE_AVX512_SIMD16
1139 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1140 {
1141 SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
1142 return false; // Not enough vertices to assemble 16 triangles.
1143 }
1144
1145 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1146 {
1147 // clang-format off
1148
1149 const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
1150 const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
1151
1152 const simd16mask mask0 = 0xF000;
1153
1154 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1155 // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1156 // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1157
1158 simd16vector& v0 = verts[0];
1159 simd16vector& v1 = verts[1];
1160 simd16vector& v2 = verts[2];
1161
1162 // for simd16 x, y, z, and w
1163 for (int i = 0; i < 4; i += 1)
1164 {
1165 simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
1166 simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
1167
1168 simd16scalar perm0 = _simd16_permute2f128_ps(tempa, tempa, 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
1169 simd16scalar perm1 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1170
1171 simd16scalar blend = _simd16_blend_ps(perm0, perm1, mask0); // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
1172 simd16scalar shuff = _simd16_shuffle_ps(tempa, blend, _MM_SHUFFLE(1, 0, 3, 2)); // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
1173
1174 v0[i] = tempa; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1175 v1[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1176 v2[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1177 }
1178
1179 SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1180 return true;
1181
1182 // clang-format on
1183 }
1184
1185 #endif
1186 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1187 {
1188 #if USE_SIMD16_FRONTEND
1189 const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
1190 const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
1191
1192 if (pa.useAlternateOffset)
1193 {
1194 primIndex += KNOB_SIMD_WIDTH;
1195 }
1196
1197 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1198 // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1199 // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1200
1201 switch (primIndex)
1202 {
1203 case 0:
1204 verts[0] = swizzleLane0(a);
1205 verts[1] = swizzleLane1(a);
1206 verts[2] = swizzleLane2(a);
1207 break;
1208 case 1:
1209 verts[0] = swizzleLane1(a);
1210 verts[1] = swizzleLane3(a);
1211 verts[2] = swizzleLane2(a);
1212 break;
1213 case 2:
1214 verts[0] = swizzleLane2(a);
1215 verts[1] = swizzleLane3(a);
1216 verts[2] = swizzleLane4(a);
1217 break;
1218 case 3:
1219 verts[0] = swizzleLane3(a);
1220 verts[1] = swizzleLane5(a);
1221 verts[2] = swizzleLane4(a);
1222 break;
1223 case 4:
1224 verts[0] = swizzleLane4(a);
1225 verts[1] = swizzleLane5(a);
1226 verts[2] = swizzleLane6(a);
1227 break;
1228 case 5:
1229 verts[0] = swizzleLane5(a);
1230 verts[1] = swizzleLane7(a);
1231 verts[2] = swizzleLane6(a);
1232 break;
1233 case 6:
1234 verts[0] = swizzleLane6(a);
1235 verts[1] = swizzleLane7(a);
1236 verts[2] = swizzleLane8(a);
1237 break;
1238 case 7:
1239 verts[0] = swizzleLane7(a);
1240 verts[1] = swizzleLane9(a);
1241 verts[2] = swizzleLane8(a);
1242 break;
1243 case 8:
1244 verts[0] = swizzleLane8(a);
1245 verts[1] = swizzleLane9(a);
1246 verts[2] = swizzleLaneA(a);
1247 break;
1248 case 9:
1249 verts[0] = swizzleLane9(a);
1250 verts[1] = swizzleLaneB(a);
1251 verts[2] = swizzleLaneA(a);
1252 break;
1253 case 10:
1254 verts[0] = swizzleLaneA(a);
1255 verts[1] = swizzleLaneB(a);
1256 verts[2] = swizzleLaneC(a);
1257 break;
1258 case 11:
1259 verts[0] = swizzleLaneB(a);
1260 verts[1] = swizzleLaneD(a);
1261 verts[2] = swizzleLaneC(a);
1262 break;
1263 case 12:
1264 verts[0] = swizzleLaneC(a);
1265 verts[1] = swizzleLaneD(a);
1266 verts[2] = swizzleLaneE(a);
1267 break;
1268 case 13:
1269 verts[0] = swizzleLaneD(a);
1270 verts[1] = swizzleLaneF(a);
1271 verts[2] = swizzleLaneE(a);
1272 break;
1273 case 14:
1274 verts[0] = swizzleLaneE(a);
1275 verts[1] = swizzleLaneF(a);
1276 verts[2] = swizzleLane0(b);
1277 break;
1278 case 15:
1279 verts[0] = swizzleLaneF(a);
1280 verts[1] = swizzleLane1(b);
1281 verts[2] = swizzleLane0(b);
1282 break;
1283 };
1284 #else
1285 const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
1286 const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
1287
1288 // Convert from vertical to horizontal.
1289 // Tri Pattern - provoking vertex is always v0
1290 // v0 -> 01234567
1291 // v1 -> 13355779
1292 // v2 -> 22446688
1293
1294 switch (primIndex)
1295 {
1296 case 0:
1297 verts[0] = swizzleLane0(a);
1298 verts[1] = swizzleLane1(a);
1299 verts[2] = swizzleLane2(a);
1300 break;
1301 case 1:
1302 verts[0] = swizzleLane1(a);
1303 verts[1] = swizzleLane3(a);
1304 verts[2] = swizzleLane2(a);
1305 break;
1306 case 2:
1307 verts[0] = swizzleLane2(a);
1308 verts[1] = swizzleLane3(a);
1309 verts[2] = swizzleLane4(a);
1310 break;
1311 case 3:
1312 verts[0] = swizzleLane3(a);
1313 verts[1] = swizzleLane5(a);
1314 verts[2] = swizzleLane4(a);
1315 break;
1316 case 4:
1317 verts[0] = swizzleLane4(a);
1318 verts[1] = swizzleLane5(a);
1319 verts[2] = swizzleLane6(a);
1320 break;
1321 case 5:
1322 verts[0] = swizzleLane5(a);
1323 verts[1] = swizzleLane7(a);
1324 verts[2] = swizzleLane6(a);
1325 break;
1326 case 6:
1327 verts[0] = swizzleLane6(a);
1328 verts[1] = swizzleLane7(a);
1329 verts[2] = swizzleLane0(b);
1330 break;
1331 case 7:
1332 verts[0] = swizzleLane7(a);
1333 verts[1] = swizzleLane1(b);
1334 verts[2] = swizzleLane0(b);
1335 break;
1336 };
1337 #endif
1338 }
1339
1340 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1341 {
1342 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
1343 return false; // Not enough vertices to assemble 8 triangles.
1344 }
1345
1346 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1347 {
1348 #if USE_SIMD16_FRONTEND
1349 simdvector leadVert;
1350 simdvector a;
1351 simdvector b;
1352
1353 const simd16vector& leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
1354
1355 if (!pa.useAlternateOffset)
1356 {
1357 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
1358
1359 for (uint32_t i = 0; i < 4; i += 1)
1360 {
1361 leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
1362
1363 a[i] = _simd16_extract_ps(a_16[i], 0);
1364 b[i] = _simd16_extract_ps(a_16[i], 1);
1365 }
1366 }
1367 else
1368 {
1369 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
1370
1371 for (uint32_t i = 0; i < 4; i += 1)
1372 {
1373 leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
1374
1375 a[i] = _simd16_extract_ps(b_16[i], 0);
1376 b[i] = _simd16_extract_ps(b_16[i], 1);
1377 }
1378 }
1379
1380 #else
1381 const simdvector& leadVert = PaGetSimdVector(pa, pa.first, slot);
1382 const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
1383 const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
1384
1385 #endif
1386 simdscalar s;
1387
1388 // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
1389 for (int i = 0; i < 4; ++i)
1390 {
1391 simdscalar a0 = a[i];
1392 simdscalar b0 = b[i];
1393
1394 simdscalar comp = leadVert[i];
1395
1396 simdvector& v0 = verts[0];
1397 v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
1398 v0[i] = _simd_permute2f128_ps(v0[i], comp, 0x00);
1399
1400 simdvector& v2 = verts[2];
1401 s = _simd_permute2f128_ps(a0, b0, 0x21);
1402 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
1403
1404 simdvector& v1 = verts[1];
1405 v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
1406 }
1407
1408 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1409 return true;
1410 }
1411
1412 #if ENABLE_AVX512_SIMD16
1413 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1414 {
1415 SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
1416 return false; // Not enough vertices to assemble 16 triangles.
1417 }
1418
1419 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1420 {
1421 // clang-format off
1422
1423 const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot);
1424 const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot);
1425 const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot);
1426
1427 const simd16mask mask0 = 0xF000;
1428
1429 // v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1430 // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1431 // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1432
1433 simd16vector& v0 = verts[0];
1434 simd16vector& v1 = verts[1];
1435 simd16vector& v2 = verts[2];
1436
1437 // for simd16 x, y, z, and w
1438 for (uint32_t i = 0; i < 4; i += 1)
1439 {
1440 simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
1441 simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
1442 simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i]));
1443
1444 simd16scalar shuff = _simd16_shuffle_ps(tempa, tempa, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4
1445
1446 v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00); // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1447
1448 simd16scalar temp0 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1449 simd16scalar temp1 = _simd16_permute2f128_ps(tempc, tempc, 0x39); // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
1450
1451 simd16scalar blend = _simd16_blend_ps(temp0, temp1, mask0); // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
1452
1453 simd16scalar temp2 = _simd16_shuffle_ps(tempb, blend, _MM_SHUFFLE(1, 0, 3, 2)); // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1454
1455 v1[i] = _simd16_shuffle_ps(tempb, temp2, _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1456 v2[i] = temp2; // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1457 }
1458
1459 SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1460 return true;
1461
1462 // clang-format on
1463 }
1464
1465 #endif
1466 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1467 {
1468 #if USE_SIMD16_FRONTEND
1469 const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot);
1470 const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot);
1471 const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot);
1472
1473 if (pa.useAlternateOffset)
1474 {
1475 primIndex += KNOB_SIMD_WIDTH;
1476 }
1477
1478 // v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1479 // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1480 // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1481
1482 // vert 0 from leading vertex
1483 verts[0] = swizzleLane0(a);
1484
1485 // vert 1
1486 if (primIndex < 15)
1487 {
1488 verts[1] = swizzleLaneN(b, primIndex + 1);
1489 }
1490 else
1491 {
1492 verts[1] = swizzleLane0(c);
1493 }
1494
1495 // vert 2
1496 if (primIndex < 14)
1497 {
1498 verts[2] = swizzleLaneN(b, primIndex + 2);
1499 }
1500 else
1501 {
1502 verts[2] = swizzleLaneN(c, primIndex - 14);
1503 }
1504 #else
1505 const simdvector& a = PaGetSimdVector(pa, pa.first, slot);
1506 const simdvector& b = PaGetSimdVector(pa, pa.prev, slot);
1507 const simdvector& c = PaGetSimdVector(pa, pa.cur, slot);
1508
1509 // vert 0 from leading vertex
1510 verts[0] = swizzleLane0(a);
1511
1512 // vert 1
1513 if (primIndex < 7)
1514 {
1515 verts[1] = swizzleLaneN(b, primIndex + 1);
1516 }
1517 else
1518 {
1519 verts[1] = swizzleLane0(c);
1520 }
1521
1522 // vert 2
1523 if (primIndex < 6)
1524 {
1525 verts[2] = swizzleLaneN(b, primIndex + 2);
1526 }
1527 else
1528 {
1529 verts[2] = swizzleLaneN(c, primIndex - 6);
1530 }
1531 #endif
1532 }
1533
1534 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1535 {
1536 SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
1537 return false; // Not enough vertices to assemble 8 triangles.
1538 }
1539
1540 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1541 {
1542 #if USE_SIMD16_FRONTEND
1543 simdvector a;
1544 simdvector b;
1545
1546 if (!pa.useAlternateOffset)
1547 {
1548 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1549
1550 for (uint32_t i = 0; i < 4; i += 1)
1551 {
1552 a[i] = _simd16_extract_ps(a_16[i], 0);
1553 b[i] = _simd16_extract_ps(a_16[i], 1);
1554 }
1555 }
1556 else
1557 {
1558 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1559
1560 for (uint32_t i = 0; i < 4; i += 1)
1561 {
1562 a[i] = _simd16_extract_ps(b_16[i], 0);
1563 b[i] = _simd16_extract_ps(b_16[i], 1);
1564 }
1565 }
1566
1567 #else
1568 simdvector& a = PaGetSimdVector(pa, 0, slot);
1569 simdvector& b = PaGetSimdVector(pa, 1, slot);
1570
1571 #endif
1572 simdscalar s1, s2;
1573
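    // Each quad (4 verts) emits two triangles, (0 1 2) and (0 2 3):
    // v0 -> 0 0 4 4 8  8  12 12
    // v1 -> 1 2 5 6 9  10 13 14
    // v2 -> 2 3 6 7 10 11 14 15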
1574 for (int i = 0; i < 4; ++i)
1575 {
1576 simdscalar a0 = a[i];
1577 simdscalar b0 = b[i];
1578
1579 s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
1580 s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
1581
1582 simdvector& v0 = verts[0];
1583 v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
1584
1585 simdvector& v1 = verts[1];
1586 v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
1587
1588 simdvector& v2 = verts[2];
1589 v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
1590 }
1591
1592 SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1593 return true;
1594 }
1595
1596 #if ENABLE_AVX512_SIMD16
1597 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1598 {
1599 SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
1600 return false; // Not enough vertices to assemble 16 triangles.
1601 }
1602
1603 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1604 {
1605 // clang-format off
1606
1607 const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
1608 const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
1609
1610    // v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
1611 // v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
1612 // v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1613
1614 simd16vector& v0 = verts[0];
1615 simd16vector& v1 = verts[1];
1616 simd16vector& v2 = verts[2];
1617
1618 // for simd16 x, y, z, and w
1619 for (uint32_t i = 0; i < 4; i += 1)
1620 {
1621 simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
1622 simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
1623
1624 simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
1625 simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1626
1627 v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
1628        v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1)); // a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
1629 v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1630 }
1631
1632 SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1633 return true;
1634
1635 // clang-format on
1636 }
1637
1638 #endif
1639 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1640 {
1641 #if USE_SIMD16_FRONTEND
1642 const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
1643 const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
1644
1645 if (pa.useAlternateOffset)
1646 {
1647 primIndex += KNOB_SIMD_WIDTH;
1648 }
1649
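    // primIndex 0-7 select triangles assembled from the 16 lanes of a;
    // primIndex 8-15 select triangles assembled from the 16 lanes of b.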
1650 switch (primIndex)
1651 {
1652 case 0:
1653 // triangle 0 - 0 1 2
1654 verts[0] = swizzleLane0(a);
1655 verts[1] = swizzleLane1(a);
1656 verts[2] = swizzleLane2(a);
1657 break;
1658 case 1:
1659 // triangle 1 - 0 2 3
1660 verts[0] = swizzleLane0(a);
1661 verts[1] = swizzleLane2(a);
1662 verts[2] = swizzleLane3(a);
1663 break;
1664 case 2:
1665 // triangle 2 - 4 5 6
1666 verts[0] = swizzleLane4(a);
1667 verts[1] = swizzleLane5(a);
1668 verts[2] = swizzleLane6(a);
1669 break;
1670 case 3:
1671 // triangle 3 - 4 6 7
1672 verts[0] = swizzleLane4(a);
1673 verts[1] = swizzleLane6(a);
1674 verts[2] = swizzleLane7(a);
1675 break;
1676 case 4:
1677 // triangle 4 - 8 9 A
1678 verts[0] = swizzleLane8(a);
1679 verts[1] = swizzleLane9(a);
1680 verts[2] = swizzleLaneA(a);
1681 break;
1682 case 5:
1683 // triangle 5 - 8 A B
1684 verts[0] = swizzleLane8(a);
1685 verts[1] = swizzleLaneA(a);
1686 verts[2] = swizzleLaneB(a);
1687 break;
1688 case 6:
1689 // triangle 6 - C D E
1690 verts[0] = swizzleLaneC(a);
1691 verts[1] = swizzleLaneD(a);
1692 verts[2] = swizzleLaneE(a);
1693 break;
1694 case 7:
1695 // triangle 7 - C E F
1696 verts[0] = swizzleLaneC(a);
1697 verts[1] = swizzleLaneE(a);
1698 verts[2] = swizzleLaneF(a);
1699 break;
1700 case 8:
1701         // triangle 8 - b0 b1 b2
1702 verts[0] = swizzleLane0(b);
1703 verts[1] = swizzleLane1(b);
1704 verts[2] = swizzleLane2(b);
1705 break;
1706 case 9:
1707         // triangle 9 - b0 b2 b3
1708 verts[0] = swizzleLane0(b);
1709 verts[1] = swizzleLane2(b);
1710 verts[2] = swizzleLane3(b);
1711 break;
1712 case 10:
1713         // triangle 10 - b4 b5 b6
1714 verts[0] = swizzleLane4(b);
1715 verts[1] = swizzleLane5(b);
1716 verts[2] = swizzleLane6(b);
1717 break;
1718 case 11:
1719         // triangle 11 - b4 b6 b7
1720 verts[0] = swizzleLane4(b);
1721 verts[1] = swizzleLane6(b);
1722 verts[2] = swizzleLane7(b);
1723 break;
1724 case 12:
1725         // triangle 12 - b8 b9 bA
1726 verts[0] = swizzleLane8(b);
1727 verts[1] = swizzleLane9(b);
1728 verts[2] = swizzleLaneA(b);
1729 break;
1730 case 13:
1731         // triangle 13 - b8 bA bB
1732 verts[0] = swizzleLane8(b);
1733 verts[1] = swizzleLaneA(b);
1734 verts[2] = swizzleLaneB(b);
1735 break;
1736 case 14:
1737         // triangle 14 - bC bD bE
1738 verts[0] = swizzleLaneC(b);
1739 verts[1] = swizzleLaneD(b);
1740 verts[2] = swizzleLaneE(b);
1741 break;
1742 case 15:
1743         // triangle 15 - bC bE bF
1744 verts[0] = swizzleLaneC(b);
1745 verts[1] = swizzleLaneE(b);
1746 verts[2] = swizzleLaneF(b);
1747 break;
1748 }
1749 #else
1750 const simdvector& a = PaGetSimdVector(pa, 0, slot);
1751 const simdvector& b = PaGetSimdVector(pa, 1, slot);
1752
1753 switch (primIndex)
1754 {
1755 case 0:
1756 // triangle 0 - 0 1 2
1757 verts[0] = swizzleLane0(a);
1758 verts[1] = swizzleLane1(a);
1759 verts[2] = swizzleLane2(a);
1760 break;
1761 case 1:
1762 // triangle 1 - 0 2 3
1763 verts[0] = swizzleLane0(a);
1764 verts[1] = swizzleLane2(a);
1765 verts[2] = swizzleLane3(a);
1766 break;
1767 case 2:
1768 // triangle 2 - 4 5 6
1769 verts[0] = swizzleLane4(a);
1770 verts[1] = swizzleLane5(a);
1771 verts[2] = swizzleLane6(a);
1772 break;
1773 case 3:
1774 // triangle 3 - 4 6 7
1775 verts[0] = swizzleLane4(a);
1776 verts[1] = swizzleLane6(a);
1777 verts[2] = swizzleLane7(a);
1778 break;
1779 case 4:
1780 // triangle 4 - 8 9 10 (0 1 2)
1781 verts[0] = swizzleLane0(b);
1782 verts[1] = swizzleLane1(b);
1783 verts[2] = swizzleLane2(b);
1784 break;
1785 case 5:
1786         // triangle 5 - 8 10 11 (0 2 3)
1787 verts[0] = swizzleLane0(b);
1788 verts[1] = swizzleLane2(b);
1789 verts[2] = swizzleLane3(b);
1790 break;
1791 case 6:
1792         // triangle 6 - 12 13 14 (4 5 6)
1793 verts[0] = swizzleLane4(b);
1794 verts[1] = swizzleLane5(b);
1795 verts[2] = swizzleLane6(b);
1796 break;
1797 case 7:
1798         // triangle 7 - 12 14 15 (4 6 7)
1799 verts[0] = swizzleLane4(b);
1800 verts[1] = swizzleLane6(b);
1801 verts[2] = swizzleLane7(b);
1802 break;
1803 }
1804 #endif
1805 }
1806
1807 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1808 {
1809 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
1810 return false;
1811 }
1812
1813 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1814 {
1815 PaLineStrip1(pa, slot, verts);
1816
1817 if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1)
1818 {
1819 // loop reconnect now
1820 const int lane = pa.numPrims - pa.numPrimsComplete - 1;
1821
1822 #if USE_SIMD16_FRONTEND
1823 simdvector first;
1824
1825 const simd16vector& first_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
1826
1827 if (!pa.useAlternateOffset)
1828 {
1829 for (uint32_t i = 0; i < 4; i += 1)
1830 {
1831 first[i] = _simd16_extract_ps(first_16[i], 0);
1832 }
1833 }
1834 else
1835 {
1836 for (uint32_t i = 0; i < 4; i += 1)
1837 {
1838 first[i] = _simd16_extract_ps(first_16[i], 1);
1839 }
1840 }
1841
1842 #else
1843 simdvector& first = PaGetSimdVector(pa, pa.first, slot);
1844
1845 #endif
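        // Close the loop: overwrite the end vertex of the last line in this
        // batch with the first vertex of the loop.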
1846 for (int i = 0; i < 4; i++)
1847 {
1848 float* firstVtx = (float*)&(first[i]);
1849 float* targetVtx = (float*)&(verts[1][i]);
1850 targetVtx[lane] = firstVtx[0];
1851 }
1852 }
1853
1854 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1855 return true;
1856 }
1857
1858 #if ENABLE_AVX512_SIMD16
1859 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1860 {
1861 SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0);
1862 return false;
1863 }
1864
1865 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1866 {
1867 PaLineStrip1_simd16(pa, slot, verts);
1868
1869 if (pa.numPrimsComplete + KNOB_SIMD16_WIDTH > pa.numPrims - 1)
1870 {
1871 // loop reconnect now
1872 const int lane = pa.numPrims - pa.numPrimsComplete - 1;
1873
1874 const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot);
1875
1876 for (int i = 0; i < 4; i++)
1877 {
1878 float* firstVtx = (float*)&(first[i]);
1879 float* targetVtx = (float*)&(verts[1][i]);
1880 targetVtx[lane] = firstVtx[0];
1881 }
1882 }
1883
1884 SetNextPaState_simd16(
1885 pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1886 return true;
1887 }
1888
1889 #endif
1890 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1891 {
1892 PaLineStripSingle0(pa, slot, primIndex, verts);
1893
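    // If this is the final line of the loop, replace its end vertex with the
    // loop's first vertex to close the loop.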
1894 if (pa.numPrimsComplete + primIndex == pa.numPrims - 1)
1895 {
1896 #if USE_SIMD16_FRONTEND
1897 const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot);
1898
1899 verts[1] = swizzleLane0(first);
1900 #else
1901 const simdvector& first = PaGetSimdVector(pa, pa.first, slot);
1902
1903 verts[1] = swizzleLane0(first);
1904 #endif
1905 }
1906 }
1907
1908 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1909 {
1910 SetNextPaState(pa, PaLineList1, PaLineListSingle0);
1911 return false; // Not enough vertices to assemble 8 lines
1912 }
1913
1914 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1915 {
1916 #if USE_SIMD16_FRONTEND
1917 simdvector a;
1918 simdvector b;
1919
1920 if (!pa.useAlternateOffset)
1921 {
1922 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1923
1924 for (uint32_t i = 0; i < 4; i += 1)
1925 {
1926 a[i] = _simd16_extract_ps(a_16[i], 0);
1927 b[i] = _simd16_extract_ps(a_16[i], 1);
1928 }
1929 }
1930 else
1931 {
1932 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1933
1934 for (uint32_t i = 0; i < 4; i += 1)
1935 {
1936 a[i] = _simd16_extract_ps(b_16[i], 0);
1937 b[i] = _simd16_extract_ps(b_16[i], 1);
1938 }
1939 }
1940
1941 #else
1942 simdvector& a = PaGetSimdVector(pa, 0, slot);
1943 simdvector& b = PaGetSimdVector(pa, 1, slot);
1944
1945 #endif
1946 /// @todo: verify provoking vertex is correct
1947 // Line list 0 1 2 3 4 5 6 7
1948 // 8 9 10 11 12 13 14 15
1949
1950 // shuffle:
1951 // 0 2 4 6 8 10 12 14
1952 // 1 3 5 7 9 11 13 15
1953
1954 for (uint32_t i = 0; i < 4; ++i)
1955 {
1956 // 0 1 2 3 8 9 10 11
1957 __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
1958 // 4 5 6 7 12 13 14 15
1959 __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
1960
1961 // 0 2 4 6 8 10 12 14
1962 verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
1963 // 1 3 5 7 9 11 13 15
1964 verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
1965 }
1966
1967 SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1968 return true;
1969 }
1970
1971 #if ENABLE_AVX512_SIMD16
1972 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1973 {
1974 SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0);
1975 return false; // Not enough vertices to assemble 16 lines
1976 }
1977
1978 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1979 {
1980 // clang-format off
1981
1982 const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
1983 const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
1984
1985 // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
1986     //  v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
1987
1988 simd16vector& v0 = verts[0];
1989 simd16vector& v1 = verts[1];
1990
1991 // for simd16 x, y, z, and w
1992 for (int i = 0; i < 4; i += 1)
1993 {
1994 simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
1995 simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
1996
1997         simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
1998 simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1999
2000 v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
2001 v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
2002 }
2003
2004 SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2005 return true;
2006
2007 // clang-format on
2008 }
2009
2010 #endif
2011 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2012 {
2013 #if USE_SIMD16_FRONTEND
2014 const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
2015 const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
2016
2017 if (pa.useAlternateOffset)
2018 {
2019 primIndex += KNOB_SIMD_WIDTH;
2020 }
2021
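    // primIndex 0-7 select lines from the 16 lanes of a; primIndex 8-15
    // select lines from the 16 lanes of b.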
2022 switch (primIndex)
2023 {
2024 case 0:
2025 verts[0] = swizzleLane0(a);
2026 verts[1] = swizzleLane1(a);
2027 break;
2028 case 1:
2029 verts[0] = swizzleLane2(a);
2030 verts[1] = swizzleLane3(a);
2031 break;
2032 case 2:
2033 verts[0] = swizzleLane4(a);
2034 verts[1] = swizzleLane5(a);
2035 break;
2036 case 3:
2037 verts[0] = swizzleLane6(a);
2038 verts[1] = swizzleLane7(a);
2039 break;
2040 case 4:
2041 verts[0] = swizzleLane8(a);
2042 verts[1] = swizzleLane9(a);
2043 break;
2044 case 5:
2045 verts[0] = swizzleLaneA(a);
2046 verts[1] = swizzleLaneB(a);
2047 break;
2048 case 6:
2049 verts[0] = swizzleLaneC(a);
2050 verts[1] = swizzleLaneD(a);
2051 break;
2052 case 7:
2053 verts[0] = swizzleLaneE(a);
2054 verts[1] = swizzleLaneF(a);
2055 break;
2056 case 8:
2057 verts[0] = swizzleLane0(b);
2058 verts[1] = swizzleLane1(b);
2059 break;
2060 case 9:
2061 verts[0] = swizzleLane2(b);
2062 verts[1] = swizzleLane3(b);
2063 break;
2064 case 10:
2065 verts[0] = swizzleLane4(b);
2066 verts[1] = swizzleLane5(b);
2067 break;
2068 case 11:
2069 verts[0] = swizzleLane6(b);
2070 verts[1] = swizzleLane7(b);
2071 break;
2072 case 12:
2073 verts[0] = swizzleLane8(b);
2074 verts[1] = swizzleLane9(b);
2075 break;
2076 case 13:
2077 verts[0] = swizzleLaneA(b);
2078 verts[1] = swizzleLaneB(b);
2079 break;
2080 case 14:
2081 verts[0] = swizzleLaneC(b);
2082 verts[1] = swizzleLaneD(b);
2083 break;
2084 case 15:
2085 verts[0] = swizzleLaneE(b);
2086 verts[1] = swizzleLaneF(b);
2087 break;
2088 }
2089 #else
2090 const simdvector& a = PaGetSimdVector(pa, 0, slot);
2091 const simdvector& b = PaGetSimdVector(pa, 1, slot);
2092
2093 switch (primIndex)
2094 {
2095 case 0:
2096 verts[0] = swizzleLane0(a);
2097 verts[1] = swizzleLane1(a);
2098 break;
2099 case 1:
2100 verts[0] = swizzleLane2(a);
2101 verts[1] = swizzleLane3(a);
2102 break;
2103 case 2:
2104 verts[0] = swizzleLane4(a);
2105 verts[1] = swizzleLane5(a);
2106 break;
2107 case 3:
2108 verts[0] = swizzleLane6(a);
2109 verts[1] = swizzleLane7(a);
2110 break;
2111 case 4:
2112 verts[0] = swizzleLane0(b);
2113 verts[1] = swizzleLane1(b);
2114 break;
2115 case 5:
2116 verts[0] = swizzleLane2(b);
2117 verts[1] = swizzleLane3(b);
2118 break;
2119 case 6:
2120 verts[0] = swizzleLane4(b);
2121 verts[1] = swizzleLane5(b);
2122 break;
2123 case 7:
2124 verts[0] = swizzleLane6(b);
2125 verts[1] = swizzleLane7(b);
2126 break;
2127 }
2128 #endif
2129 }
2130
2131 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2132 {
2133 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
2134 return false; // Not enough vertices to assemble 8 lines
2135 }
2136
2137 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2138 {
2139 #if USE_SIMD16_FRONTEND
2140 simdvector a;
2141 simdvector b;
2142
2143 if (!pa.useAlternateOffset)
2144 {
2145 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
2146
2147 for (uint32_t i = 0; i < 4; i += 1)
2148 {
2149 a[i] = _simd16_extract_ps(a_16[i], 0);
2150 b[i] = _simd16_extract_ps(a_16[i], 1);
2151 }
2152 }
2153 else
2154 {
2155 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
2156
2157 for (uint32_t i = 0; i < 4; i += 1)
2158 {
2159 a[i] = _simd16_extract_ps(b_16[i], 0);
2160 b[i] = _simd16_extract_ps(b_16[i], 1);
2161 }
2162 }
2163
2164 #else
2165 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
2166 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
2167
2168 #endif
2169 /// @todo: verify provoking vertex is correct
2170     // Line strip 0  1  2  3  4  5  6  7
2171 // 8 9 10 11 12 13 14 15
2172
2173 // shuffle:
2174 // 0 1 2 3 4 5 6 7
2175 // 1 2 3 4 5 6 7 8
2176
2177 verts[0] = a;
2178
2179 for (uint32_t i = 0; i < 4; ++i)
2180 {
2181 // 1 2 3 x 5 6 7 x
2182 __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
2183 // 4 5 6 7 8 9 10 11
2184 __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
2185
2186 // x x x 4 x x x 8
2187 __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0)
2188
2189 verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
2190 }
2191
2192 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
2193 return true;
2194 }
2195
2196 #if ENABLE_AVX512_SIMD16
2197 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2198 {
2199 SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0);
2200 return false; // Not enough vertices to assemble 16 lines
2201 }
2202
2203 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2204 {
2205 // clang-format off
2206
2207 const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
2208
2209 const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
2210 const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
2211
2212 const simd16mask mask0 = 0x0001;
2213
2214 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2215 // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2216
2217 simd16vector& v0 = verts[0];
2218 simd16vector& v1 = verts[1];
2219
2220 v0 = a; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2221
2222 // for simd16 x, y, z, and w
2223 for (int i = 0; i < 4; i += 1)
2224 {
2225 simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
2226 simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
2227
2228 simd16scalar temp = _simd16_blend_ps(tempa, tempb, mask0); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2229
2230 v1[i] = _simd16_permute_ps(temp, perm); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2231 }
2232
2233 SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
2234 return true;
2235
2236 // clang-format on
2237 }
2238
2239 #endif
2240 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2241 {
2242 #if USE_SIMD16_FRONTEND
2243 const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
2244 const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
2245
2246 if (pa.useAlternateOffset)
2247 {
2248 primIndex += KNOB_SIMD_WIDTH;
2249 }
2250
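    // primIndex 0-14 select consecutive lane pairs from a; primIndex 15
    // connects the last lane of a to the first lane of b.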
2251 switch (primIndex)
2252 {
2253 case 0:
2254 verts[0] = swizzleLane0(a);
2255 verts[1] = swizzleLane1(a);
2256 break;
2257 case 1:
2258 verts[0] = swizzleLane1(a);
2259 verts[1] = swizzleLane2(a);
2260 break;
2261 case 2:
2262 verts[0] = swizzleLane2(a);
2263 verts[1] = swizzleLane3(a);
2264 break;
2265 case 3:
2266 verts[0] = swizzleLane3(a);
2267 verts[1] = swizzleLane4(a);
2268 break;
2269 case 4:
2270 verts[0] = swizzleLane4(a);
2271 verts[1] = swizzleLane5(a);
2272 break;
2273 case 5:
2274 verts[0] = swizzleLane5(a);
2275 verts[1] = swizzleLane6(a);
2276 break;
2277 case 6:
2278 verts[0] = swizzleLane6(a);
2279 verts[1] = swizzleLane7(a);
2280 break;
2281 case 7:
2282 verts[0] = swizzleLane7(a);
2283 verts[1] = swizzleLane8(a);
2284 break;
2285 case 8:
2286 verts[0] = swizzleLane8(a);
2287 verts[1] = swizzleLane9(a);
2288 break;
2289 case 9:
2290 verts[0] = swizzleLane9(a);
2291 verts[1] = swizzleLaneA(a);
2292 break;
2293 case 10:
2294 verts[0] = swizzleLaneA(a);
2295 verts[1] = swizzleLaneB(a);
2296 break;
2297 case 11:
2298 verts[0] = swizzleLaneB(a);
2299 verts[1] = swizzleLaneC(a);
2300 break;
2301 case 12:
2302 verts[0] = swizzleLaneC(a);
2303 verts[1] = swizzleLaneD(a);
2304 break;
2305 case 13:
2306 verts[0] = swizzleLaneD(a);
2307 verts[1] = swizzleLaneE(a);
2308 break;
2309 case 14:
2310 verts[0] = swizzleLaneE(a);
2311 verts[1] = swizzleLaneF(a);
2312 break;
2313 case 15:
2314 verts[0] = swizzleLaneF(a);
2315 verts[1] = swizzleLane0(b);
2316 break;
2317 }
2318 #else
2319 const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
2320 const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
2321
2322 switch (primIndex)
2323 {
2324 case 0:
2325 verts[0] = swizzleLane0(a);
2326 verts[1] = swizzleLane1(a);
2327 break;
2328 case 1:
2329 verts[0] = swizzleLane1(a);
2330 verts[1] = swizzleLane2(a);
2331 break;
2332 case 2:
2333 verts[0] = swizzleLane2(a);
2334 verts[1] = swizzleLane3(a);
2335 break;
2336 case 3:
2337 verts[0] = swizzleLane3(a);
2338 verts[1] = swizzleLane4(a);
2339 break;
2340 case 4:
2341 verts[0] = swizzleLane4(a);
2342 verts[1] = swizzleLane5(a);
2343 break;
2344 case 5:
2345 verts[0] = swizzleLane5(a);
2346 verts[1] = swizzleLane6(a);
2347 break;
2348 case 6:
2349 verts[0] = swizzleLane6(a);
2350 verts[1] = swizzleLane7(a);
2351 break;
2352 case 7:
2353 verts[0] = swizzleLane7(a);
2354 verts[1] = swizzleLane0(b);
2355 break;
2356 }
2357 #endif
2358 }
2359
2360 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2361 {
2362 #if USE_SIMD16_FRONTEND
2363 simdvector a;
2364
2365 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2366
2367 if (!pa.useAlternateOffset)
2368 {
2369 for (uint32_t i = 0; i < 4; i += 1)
2370 {
2371 a[i] = _simd16_extract_ps(a_16[i], 0);
2372 }
2373 }
2374 else
2375 {
2376 for (uint32_t i = 0; i < 4; i += 1)
2377 {
2378 a[i] = _simd16_extract_ps(a_16[i], 1);
2379 }
2380 }
2381
2382 #else
2383 simdvector& a = PaGetSimdVector(pa, 0, slot);
2384
2385 #endif
2386 verts[0] = a; // points only have 1 vertex.
2387
2388 SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2389 return true;
2390 }
2391
2392 #if ENABLE_AVX512_SIMD16
2393 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2394 {
2395 simd16vector& a = PaGetSimdVector_simd16(pa, pa.cur, slot);
2396
2397 verts[0] = a; // points only have 1 vertex.
2398
2399 SetNextPaState_simd16(
2400 pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2401 return true;
2402 }
2403
2404 #endif
2405 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2406 {
2407 #if USE_SIMD16_FRONTEND
2408 const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
2409
2410 if (pa.useAlternateOffset)
2411 {
2412 primIndex += KNOB_SIMD_WIDTH;
2413 }
2414
2415 verts[0] = swizzleLaneN(a, primIndex);
2416 #else
2417 const simdvector& a = PaGetSimdVector(pa, 0, slot);
2418
2419 verts[0] = swizzleLaneN(a, primIndex);
2420 #endif
2421 }
2422
2423 //////////////////////////////////////////////////////////////////////////
2424 /// @brief State 0 for RECT_LIST topology.
2425 ///        There are not enough vertices to assemble 8 triangles.
2426 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2427 {
2428 SetNextPaState(pa, PaRectList1, PaRectListSingle0);
2429 return false;
2430 }
2431
2432 //////////////////////////////////////////////////////////////////////////
2433 /// @brief State 1 for RECT_LIST topology.
2434 ///        Rect lists have the following format.
2435 /// w x y z
2436 /// v2 o---o v5 o---o v8 o---o v11 o---o
2437 /// | \ | | \ | | \ | | \ |
2438 /// v1 o---o v4 o---o v7 o---o v10 o---o
2439 /// v0 v3 v6 v9
2440 ///
2441 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2442 ///
2443 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2444 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2445 /// etc.
2446 ///
2447 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2448 /// where v0 contains all the first vertices for 8 triangles.
2449 ///
2450 /// Result:
2451 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2452 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2453 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
2454 ///
2455 /// @param pa - State for PA state machine.
2456 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2457 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
2458 /// etc.
2459 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2460 {
2461 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
2462 #if USE_SIMD16_FRONTEND
2463 simdvector a;
2464 simdvector b;
2465
2466 if (!pa.useAlternateOffset)
2467 {
2468 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2469
2470 for (uint32_t i = 0; i < 4; i += 1)
2471 {
2472 a[i] = _simd16_extract_ps(a_16[i], 0);
2473 b[i] = _simd16_extract_ps(a_16[i], 1);
2474 }
2475 }
2476 else
2477 {
2478 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
2479
2480 for (uint32_t i = 0; i < 4; i += 1)
2481 {
2482 a[i] = _simd16_extract_ps(b_16[i], 0);
2483             b[i] = _simd16_extract_ps(b_16[i], 1);
2485 }
2486 }
2487
2488 #else
2489 simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
2490 simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
2491
2492 #endif
2493 __m256 tmp0, tmp1, tmp2;
2494
2495 // Loop over each component in the simdvector.
2496 for (int i = 0; i < 4; ++i)
2497 {
2498 simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2499 tmp0 = _mm256_permute2f128_ps(
2500 b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2501 v0[i] = _mm256_blend_ps(
2502 a[i],
2503 tmp0,
2504 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
2505 tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
2506 v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
2507 v0[i] =
2508 _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
2509
2510 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2511 /// AVX2 should make this much cheaper.
2512 simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2513 v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
2514 tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
2515 tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
2516 tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, *, *, *, * }
2517 v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
2518 v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
2519 v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2520
2521 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
2522 simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2523 v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
2524 tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
2525 v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
2526
2527 // Need to compute 4th implied vertex for the rectangle.
2528 tmp2 = _mm256_sub_ps(v0[i], v1[i]);
2529 tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * }
2530 tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
2531 v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
2532 }
2533
2534 SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2535 return true;
2536 }
2537
2538 //////////////////////////////////////////////////////////////////////////
2539 /// @brief State 2 for RECT_LIST topology.
2540 ///        Not implemented unless there is a use case for more than 8 rects.
2541 /// @param pa - State for PA state machine.
2542 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2543 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
2544 /// etc.
2545 bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2546 {
2547     SWR_INVALID("Is rect list used for anything other than clears?");
2548 SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2549 return true;
2550 }
2551
2552 #if ENABLE_AVX512_SIMD16
2553 //////////////////////////////////////////////////////////////////////////
2554 /// @brief State 0 for RECT_LIST topology.
2555 ///        There are not enough vertices to assemble 8 triangles.
2556 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2557 {
2558 SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0);
2559 return false;
2560 }
2561
2562 //////////////////////////////////////////////////////////////////////////
2563 /// @brief State 1 for RECT_LIST topology.
2564 ///        Rect lists have the following format.
2565 /// w x y z
2566 /// v2 o---o v5 o---o v8 o---o v11 o---o
2567 /// | \ | | \ | | \ | | \ |
2568 /// v1 o---o v4 o---o v7 o---o v10 o---o
2569 /// v0 v3 v6 v9
2570 ///
2571 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2572 ///
2573 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2574 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2575 /// etc.
2576 ///
2577 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2578 /// where v0 contains all the first vertices for 8 triangles.
2579 ///
2580 /// Result:
2581 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2582 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2583 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
2584 ///
2585 /// @param pa - State for PA state machine.
2586 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2587 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
2588 /// etc.
2589 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2590 {
2591 // clang-format off
2592
2593 simdvector a;
2594 simdvector b;
2595
2596 if (!pa.useAlternateOffset)
2597 {
2598 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7,
2599 // v8, v9, v10, v11, v12, v13, v14, v15 }
2600
2601 for (uint32_t i = 0; i < 4; i += 1)
2602 {
2603 a[i] = _simd16_extract_ps(a_16[i], 0);
2604 b[i] = _simd16_extract_ps(a_16[i], 1);
2605 }
2606 }
2607 else
2608 {
2609         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);             // b[] = { v16, ... } (but not used by this implementation)
2610
2611 for (uint32_t i = 0; i < 4; i += 1)
2612 {
2613 a[i] = _simd16_extract_ps(b_16[i], 0);
2614 b[i] = _simd16_extract_ps(b_16[i], 1);
2615 }
2616 }
2617
2618 simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2619 simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2620 simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2621
2622 // Loop over each component in the simdvector.
2623 for (int i = 0; i < 4; i += 1)
2624 {
2625 simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2626 simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2627 simdscalar v2_lo; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2628
2629 __m256 tmp0, tmp1, tmp2;
2630
2631 tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2632 v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
2633 tmp1 = _mm256_permute_ps(v0_lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
2634 v0_lo = _mm256_permute_ps(v0_lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
2635 v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
2636
2637 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2638 /// AVX2 should make this much cheaper.
2639 v1_lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
2640 tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
2641 tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
2642 tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, *, *, *, * }
2643 v1_lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
2644 v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
2645 v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2646
2647 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
2648 v2_lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
2649 tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
2650 v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0);
2651
2652 // Need to compute 4th implied vertex for the rectangle.
2653 tmp2 = _mm256_sub_ps(v0_lo, v1_lo);
2654 tmp2 = _mm256_add_ps(tmp2, v2_lo); // tmp2 = { w, *, x, *, y, *, z, * }
2655 tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
2656 v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
2657
2658 v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0);
2659 v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0);
2660 v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
2661 }
2662
2663 SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2664 return true;
2665
2666 // clang-format on
2667 }
2668
2669 //////////////////////////////////////////////////////////////////////////
2670 /// @brief State 2 for RECT_LIST topology.
2671 ///        Not implemented unless there is a use case for more than 8 rects.
2672 /// @param pa - State for PA state machine.
2673 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2674 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
2675 /// etc.
2676 bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2677 {
2678     SWR_INVALID("Is rect list used for anything other than clears?");
2679 SetNextPaState_simd16(
2680 pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2681 return true;
2682 }
2683
2684 #endif
2685 //////////////////////////////////////////////////////////////////////////
2686 /// @brief This procedure is called by the Binner to assemble the attributes.
2687 /// Unlike position, which is stored vertically, the attributes are
2688 /// stored horizontally. The outputs from the VS, labeled as 'a' and
2689 /// 'b' are vertical. This function needs to transpose the lanes
2690 /// containing the vertical attribute data into horizontal form.
2691 /// @param pa - State for PA state machine.
2692 /// @param slot - Index into VS output for a given attribute.
2693 /// @param primIndex - Binner processes each triangle individually.
2694 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
2695 /// etc.
2696 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2697 {
2698 // We have 12 simdscalars contained within 3 simdvectors which
2699 // hold at least 8 triangles worth of data. We want to assemble a single
2700 // triangle with data in horizontal form.
2701 #if USE_SIMD16_FRONTEND
2702 simdvector a;
2703 simdvector b;
2704
2705 if (!pa.useAlternateOffset)
2706 {
2707 const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2708
2709 for (uint32_t i = 0; i < 4; i += 1)
2710 {
2711 a[i] = _simd16_extract_ps(a_16[i], 0);
2712 b[i] = _simd16_extract_ps(a_16[i], 1);
2713 }
2714 }
2715 else
2716 {
2717 const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
2718
2719 for (uint32_t i = 0; i < 4; i += 1)
2720 {
2721 a[i] = _simd16_extract_ps(b_16[i], 0);
2722             b[i] = _simd16_extract_ps(b_16[i], 1);
2724 }
2725 }
2726
2727 #else
2728 simdvector& a = PaGetSimdVector(pa, 0, slot);
2729
2730 #endif
2731 // Convert from vertical to horizontal.
2732 switch (primIndex)
2733 {
2734 case 0:
2735 verts[0] = swizzleLane0(a);
2736 verts[1] = swizzleLane1(a);
2737 verts[2] = swizzleLane2(a);
2738 break;
2739 case 1:
2740 verts[0] = swizzleLane0(a);
2741 verts[1] = swizzleLane2(a);
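        // Attributes for the implied 4th rect vertex: x and z come from v0,
        // y and w come from v2 (blend mask 0xA).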
2742 verts[2] = _mm_blend_ps(verts[0], verts[1], 0xA);
2743 break;
2744 case 2:
2745 case 3:
2746 case 4:
2747 case 5:
2748 case 6:
2749 case 7:
2750 SWR_INVALID("Invalid primIndex: %d", primIndex);
2751 break;
2752 };
2753 }
2754
2755 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT* in_pDC,
2756 uint32_t in_numPrims,
2757 uint8_t* pStream,
2758 uint32_t in_streamSizeInVerts,
2759 uint32_t in_vertexStride,
2760 bool in_isStreaming,
2761 uint32_t numVertsPerPrim,
2762 PRIMITIVE_TOPOLOGY topo) :
2763 PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim),
2764 numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), cur(0), prev(0), first(0),
2765 counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
2766 {
2767 const API_STATE& state = GetApiState(pDC);
2768
2769 this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
2770
2771 #if ENABLE_AVX512_SIMD16
2772 pfnPaFunc_simd16 = nullptr;
2773
2774 #endif
2775 switch (this->binTopology)
2776 {
2777 case TOP_TRIANGLE_LIST:
2778 this->pfnPaFunc = PaTriList0;
2779 #if ENABLE_AVX512_SIMD16
2780 this->pfnPaFunc_simd16 = PaTriList0_simd16;
2781 #endif
2782 break;
2783 case TOP_TRIANGLE_STRIP:
2784 this->pfnPaFunc = PaTriStrip0;
2785 #if ENABLE_AVX512_SIMD16
2786 this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
2787 #endif
2788 break;
2789 case TOP_TRIANGLE_FAN:
2790 this->pfnPaFunc = PaTriFan0;
2791 #if ENABLE_AVX512_SIMD16
2792 this->pfnPaFunc_simd16 = PaTriFan0_simd16;
2793 #endif
2794 break;
2795 case TOP_QUAD_LIST:
2796 this->pfnPaFunc = PaQuadList0;
2797 #if ENABLE_AVX512_SIMD16
2798 this->pfnPaFunc_simd16 = PaQuadList0_simd16;
2799 #endif
2800 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
2801 break;
2802 case TOP_QUAD_STRIP:
2803         // quad strip pattern when decomposed into triangles is the same as tri strips
2804 this->pfnPaFunc = PaTriStrip0;
2805 #if ENABLE_AVX512_SIMD16
2806 this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
2807 #endif
2808 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
2809 break;
2810 case TOP_LINE_LIST:
2811 this->pfnPaFunc = PaLineList0;
2812 #if ENABLE_AVX512_SIMD16
2813 this->pfnPaFunc_simd16 = PaLineList0_simd16;
2814 #endif
2815 this->numPrims = in_numPrims;
2816 break;
2817 case TOP_LINE_STRIP:
2818 this->pfnPaFunc = PaLineStrip0;
2819 #if ENABLE_AVX512_SIMD16
2820 this->pfnPaFunc_simd16 = PaLineStrip0_simd16;
2821 #endif
2822 this->numPrims = in_numPrims;
2823 break;
2824 case TOP_LINE_LOOP:
2825 this->pfnPaFunc = PaLineLoop0;
2826 #if ENABLE_AVX512_SIMD16
2827 this->pfnPaFunc_simd16 = PaLineLoop0_simd16;
2828 #endif
2829 this->numPrims = in_numPrims;
2830 break;
2831 case TOP_POINT_LIST:
2832 this->pfnPaFunc = PaPoints0;
2833 #if ENABLE_AVX512_SIMD16
2834 this->pfnPaFunc_simd16 = PaPoints0_simd16;
2835 #endif
2836 this->numPrims = in_numPrims;
2837 break;
2838 case TOP_RECT_LIST:
2839 this->pfnPaFunc = PaRectList0;
2840 #if ENABLE_AVX512_SIMD16
2841 this->pfnPaFunc_simd16 = PaRectList0_simd16;
2842 #endif
2843 this->numPrims = in_numPrims * 2;
2844 break;
2845
2846 case TOP_PATCHLIST_1:
2847 this->pfnPaFunc = PaPatchList<1>;
2848 #if ENABLE_AVX512_SIMD16
2849 this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
2850 #endif
2851 break;
2852 case TOP_PATCHLIST_2:
2853 this->pfnPaFunc = PaPatchList<2>;
2854 #if ENABLE_AVX512_SIMD16
2855 this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
2856 #endif
2857 break;
2858 case TOP_PATCHLIST_3:
2859 this->pfnPaFunc = PaPatchList<3>;
2860 #if ENABLE_AVX512_SIMD16
2861 this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
2862 #endif
2863 break;
2864 case TOP_PATCHLIST_4:
2865 this->pfnPaFunc = PaPatchList<4>;
2866 #if ENABLE_AVX512_SIMD16
2867 this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
2868 #endif
2869 break;
2870 case TOP_PATCHLIST_5:
2871 this->pfnPaFunc = PaPatchList<5>;
2872 #if ENABLE_AVX512_SIMD16
2873 this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
2874 #endif
2875 break;
2876 case TOP_PATCHLIST_6:
2877 this->pfnPaFunc = PaPatchList<6>;
2878 #if ENABLE_AVX512_SIMD16
2879 this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
2880 #endif
2881 break;
2882 case TOP_PATCHLIST_7:
2883 this->pfnPaFunc = PaPatchList<7>;
2884 #if ENABLE_AVX512_SIMD16
2885 this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
2886 #endif
2887 break;
2888 case TOP_PATCHLIST_8:
2889 this->pfnPaFunc = PaPatchList<8>;
2890 #if ENABLE_AVX512_SIMD16
2891 this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
2892 #endif
2893 break;
2894 case TOP_PATCHLIST_9:
2895 this->pfnPaFunc = PaPatchList<9>;
2896 #if ENABLE_AVX512_SIMD16
2897 this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
2898 #endif
2899 break;
2900 case TOP_PATCHLIST_10:
2901 this->pfnPaFunc = PaPatchList<10>;
2902 #if ENABLE_AVX512_SIMD16
2903 this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
2904 #endif
2905 break;
2906 case TOP_PATCHLIST_11:
2907 this->pfnPaFunc = PaPatchList<11>;
2908 #if ENABLE_AVX512_SIMD16
2909 this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
2910 #endif
2911 break;
2912 case TOP_PATCHLIST_12:
2913 this->pfnPaFunc = PaPatchList<12>;
2914 #if ENABLE_AVX512_SIMD16
2915 this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
2916 #endif
2917 break;
2918 case TOP_PATCHLIST_13:
2919 this->pfnPaFunc = PaPatchList<13>;
2920 #if ENABLE_AVX512_SIMD16
2921 this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
2922 #endif
2923 break;
2924 case TOP_PATCHLIST_14:
2925 this->pfnPaFunc = PaPatchList<14>;
2926 #if ENABLE_AVX512_SIMD16
2927 this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
2928 #endif
2929 break;
2930 case TOP_PATCHLIST_15:
2931 this->pfnPaFunc = PaPatchList<15>;
2932 #if ENABLE_AVX512_SIMD16
2933 this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
2934 #endif
2935 break;
2936 case TOP_PATCHLIST_16:
2937 this->pfnPaFunc = PaPatchList<16>;
2938 #if ENABLE_AVX512_SIMD16
2939 this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
2940 #endif
2941 break;
2942 case TOP_PATCHLIST_17:
2943 this->pfnPaFunc = PaPatchList<17>;
2944 #if ENABLE_AVX512_SIMD16
2945 this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
2946 #endif
2947 break;
2948 case TOP_PATCHLIST_18:
2949 this->pfnPaFunc = PaPatchList<18>;
2950 #if ENABLE_AVX512_SIMD16
2951 this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
2952 #endif
2953 break;
2954 case TOP_PATCHLIST_19:
2955 this->pfnPaFunc = PaPatchList<19>;
2956 #if ENABLE_AVX512_SIMD16
2957 this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
2958 #endif
2959 break;
2960 case TOP_PATCHLIST_20:
2961 this->pfnPaFunc = PaPatchList<20>;
2962 #if ENABLE_AVX512_SIMD16
2963 this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
2964 #endif
2965 break;
2966 case TOP_PATCHLIST_21:
2967 this->pfnPaFunc = PaPatchList<21>;
2968 #if ENABLE_AVX512_SIMD16
2969 this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
2970 #endif
2971 break;
2972 case TOP_PATCHLIST_22:
2973 this->pfnPaFunc = PaPatchList<22>;
2974 #if ENABLE_AVX512_SIMD16
2975 this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
2976 #endif
2977 break;
2978 case TOP_PATCHLIST_23:
2979 this->pfnPaFunc = PaPatchList<23>;
2980 #if ENABLE_AVX512_SIMD16
2981 this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
2982 #endif
2983 break;
2984 case TOP_PATCHLIST_24:
2985 this->pfnPaFunc = PaPatchList<24>;
2986 #if ENABLE_AVX512_SIMD16
2987 this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
2988 #endif
2989 break;
2990 case TOP_PATCHLIST_25:
2991 this->pfnPaFunc = PaPatchList<25>;
2992 #if ENABLE_AVX512_SIMD16
2993 this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
2994 #endif
2995 break;
2996 case TOP_PATCHLIST_26:
2997 this->pfnPaFunc = PaPatchList<26>;
2998 #if ENABLE_AVX512_SIMD16
2999 this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
3000 #endif
3001 break;
3002 case TOP_PATCHLIST_27:
3003 this->pfnPaFunc = PaPatchList<27>;
3004 #if ENABLE_AVX512_SIMD16
3005 this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
3006 #endif
3007 break;
3008 case TOP_PATCHLIST_28:
3009 this->pfnPaFunc = PaPatchList<28>;
3010 #if ENABLE_AVX512_SIMD16
3011 this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
3012 #endif
3013 break;
3014 case TOP_PATCHLIST_29:
3015 this->pfnPaFunc = PaPatchList<29>;
3016 #if ENABLE_AVX512_SIMD16
3017 this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
3018 #endif
3019 break;
3020 case TOP_PATCHLIST_30:
3021 this->pfnPaFunc = PaPatchList<30>;
3022 #if ENABLE_AVX512_SIMD16
3023 this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
3024 #endif
3025 break;
3026 case TOP_PATCHLIST_31:
3027 this->pfnPaFunc = PaPatchList<31>;
3028 #if ENABLE_AVX512_SIMD16
3029 this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
3030 #endif
3031 break;
3032 case TOP_PATCHLIST_32:
3033 this->pfnPaFunc = PaPatchList<32>;
3034 #if ENABLE_AVX512_SIMD16
3035 this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
3036 #endif
3037 break;
3038
3039 default:
3040 SWR_INVALID("Invalid topology: %d", this->binTopology);
3041 break;
3042 };
3043
3044 this->pfnPaFuncReset = this->pfnPaFunc;
3045 #if ENABLE_AVX512_SIMD16
3046 this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
3047 #endif
3048
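    // Seed the per-lane primitive IDs. Topologies that split each input
    // primitive into two triangles (quad and rect lists/strips) use pairs of
    // identical IDs so both triangles of a primitive share one ID.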
3049 #if USE_SIMD16_FRONTEND
3050 simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
3051 simd16scalari id82 = _simd16_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
3052
3053 #else
3054 simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
3055 simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
3056
3057 #endif
3058 switch (this->binTopology)
3059 {
3060 case TOP_TRIANGLE_LIST:
3061 case TOP_TRIANGLE_STRIP:
3062 case TOP_TRIANGLE_FAN:
3063 case TOP_LINE_STRIP:
3064 case TOP_LINE_LIST:
3065 case TOP_LINE_LOOP:
3066 #if USE_SIMD16_FRONTEND
3067 this->primIDIncr = 16;
3068 this->primID = id16;
3069 #else
3070 this->primIDIncr = 8;
3071 this->primID = id8;
3072 #endif
3073 break;
3074 case TOP_QUAD_LIST:
3075 case TOP_QUAD_STRIP:
3076 case TOP_RECT_LIST:
3077 #if USE_SIMD16_FRONTEND
3078 this->primIDIncr = 8;
3079 this->primID = id82;
3080 #else
3081 this->primIDIncr = 4;
3082 this->primID = id4;
3083 #endif
3084 break;
3085 case TOP_POINT_LIST:
3086 #if USE_SIMD16_FRONTEND
3087 this->primIDIncr = 16;
3088 this->primID = id16;
3089 #else
3090 this->primIDIncr = 8;
3091 this->primID = id8;
3092 #endif
3093 break;
3094 case TOP_PATCHLIST_1:
3095 case TOP_PATCHLIST_2:
3096 case TOP_PATCHLIST_3:
3097 case TOP_PATCHLIST_4:
3098 case TOP_PATCHLIST_5:
3099 case TOP_PATCHLIST_6:
3100 case TOP_PATCHLIST_7:
3101 case TOP_PATCHLIST_8:
3102 case TOP_PATCHLIST_9:
3103 case TOP_PATCHLIST_10:
3104 case TOP_PATCHLIST_11:
3105 case TOP_PATCHLIST_12:
3106 case TOP_PATCHLIST_13:
3107 case TOP_PATCHLIST_14:
3108 case TOP_PATCHLIST_15:
3109 case TOP_PATCHLIST_16:
3110 case TOP_PATCHLIST_17:
3111 case TOP_PATCHLIST_18:
3112 case TOP_PATCHLIST_19:
3113 case TOP_PATCHLIST_20:
3114 case TOP_PATCHLIST_21:
3115 case TOP_PATCHLIST_22:
3116 case TOP_PATCHLIST_23:
3117 case TOP_PATCHLIST_24:
3118 case TOP_PATCHLIST_25:
3119 case TOP_PATCHLIST_26:
3120 case TOP_PATCHLIST_27:
3121 case TOP_PATCHLIST_28:
3122 case TOP_PATCHLIST_29:
3123 case TOP_PATCHLIST_30:
3124 case TOP_PATCHLIST_31:
3125 case TOP_PATCHLIST_32:
3126 // Always run KNOB_SIMD_WIDTH number of patches at a time.
3127 #if USE_SIMD16_FRONTEND
3128 this->primIDIncr = 16;
3129 this->primID = id16;
3130 #else
3131 this->primIDIncr = 8;
3132 this->primID = id8;
3133 #endif
3134 break;
3135
3136 default:
3137 SWR_INVALID("Invalid topology: %d", this->binTopology);
3138 break;
3139 };
3140 }
3141 #endif
3142