1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa_avx.cpp
24 *
25 * @brief AVX implementation for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
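* For example, the triangle-list topology cycles PaTriList0 -> PaTriList1 ->
* PaTriList2: the first two states simply bank incoming vertex simdvectors and
* return false (more vertices needed), and PaTriList2 assembles a full SIMD
* width of triangles before resetting the state to PaTriList0.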
29 *
30 ******************************************************************************/
31 #include "context.h"
32 #include "pa.h"
33 #include "frontend.h"
34
35 #if (KNOB_SIMD_WIDTH == 8)
36
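// The swizzleLane* helpers below transpose a single SIMD lane from SoA form
// (separate x/y/z/w registers) to AoS form: swizzleLaneN(v, n) yields the
// simd4scalar { v.x[n], v.y[n], v.z[n], v.w[n] }.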
37 INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
38 {
39 simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
40 simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
41 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
42 }
43
44 INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
45 {
46 simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
47 simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
48 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
49 }
50
51 INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
52 {
53 simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
54 simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
55 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
56 }
57
58 INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
59 {
60 simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
61 simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
62 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
63 }
64
65 INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
66 {
67 simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
68 simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
69 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
70 }
71
72 INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
73 {
74 simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
75 simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
76 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
77 }
78
79 INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
80 {
81 simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
82 simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
83 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
84 }
85
86 INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
87 {
88 simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
89 simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
90 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
91 }
92
93 INLINE simd4scalar swizzleLane0(const simdvector &v)
94 {
95 return swizzleLane0(v.x, v.y, v.z, v.w);
96 }
97
98 INLINE simd4scalar swizzleLane1(const simdvector &v)
99 {
100 return swizzleLane1(v.x, v.y, v.z, v.w);
101 }
102
103 INLINE simd4scalar swizzleLane2(const simdvector &v)
104 {
105 return swizzleLane2(v.x, v.y, v.z, v.w);
106 }
107
108 INLINE simd4scalar swizzleLane3(const simdvector &v)
109 {
110 return swizzleLane3(v.x, v.y, v.z, v.w);
111 }
112
113 INLINE simd4scalar swizzleLane4(const simdvector &v)
114 {
115 return swizzleLane4(v.x, v.y, v.z, v.w);
116 }
117
118 INLINE simd4scalar swizzleLane5(const simdvector &v)
119 {
120 return swizzleLane5(v.x, v.y, v.z, v.w);
121 }
122
123 INLINE simd4scalar swizzleLane6(const simdvector &v)
124 {
125 return swizzleLane6(v.x, v.y, v.z, v.w);
126 }
127
128 INLINE simd4scalar swizzleLane7(const simdvector &v)
129 {
130 return swizzleLane7(v.x, v.y, v.z, v.w);
131 }
132
133 INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
134 {
135 switch (lane)
136 {
137 case 0:
138 return swizzleLane0(v);
139 case 1:
140 return swizzleLane1(v);
141 case 2:
142 return swizzleLane2(v);
143 case 3:
144 return swizzleLane3(v);
145 case 4:
146 return swizzleLane4(v);
147 case 5:
148 return swizzleLane5(v);
149 case 6:
150 return swizzleLane6(v);
151 case 7:
152 return swizzleLane7(v);
153 default:
154 return _mm_setzero_ps();
155 }
156 }
157
158 #if ENABLE_AVX512_SIMD16
159 INLINE simd4scalar swizzleLane0(const simd16vector &v)
160 {
161 return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
162 }
163
164 INLINE simd4scalar swizzleLane1(const simd16vector &v)
165 {
166 return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
167 }
168
169 INLINE simd4scalar swizzleLane2(const simd16vector &v)
170 {
171 return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
172 }
173
174 INLINE simd4scalar swizzleLane3(const simd16vector &v)
175 {
176 return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
177 }
178
179 INLINE simd4scalar swizzleLane4(const simd16vector &v)
180 {
181 return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
182 }
183
184 INLINE simd4scalar swizzleLane5(const simd16vector &v)
185 {
186 return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
187 }
188
189 INLINE simd4scalar swizzleLane6(const simd16vector &v)
190 {
191 return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
192 }
193
194 INLINE simd4scalar swizzleLane7(const simd16vector &v)
195 {
196 return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
197 }
198
199 INLINE simd4scalar swizzleLane8(const simd16vector &v)
200 {
201 return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
202 }
203
204 INLINE simd4scalar swizzleLane9(const simd16vector &v)
205 {
206 return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
207 }
208
209 INLINE simd4scalar swizzleLaneA(const simd16vector &v)
210 {
211 return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
212 }
213
214 INLINE simd4scalar swizzleLaneB(const simd16vector &v)
215 {
216 return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
217 }
218
219 INLINE simd4scalar swizzleLaneC(const simd16vector &v)
220 {
221 return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
222 }
223
224 INLINE simd4scalar swizzleLaneD(const simd16vector &v)
225 {
226 return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
227 }
228
229 INLINE simd4scalar swizzleLaneE(const simd16vector &v)
230 {
231 return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
232 }
233
234 INLINE simd4scalar swizzleLaneF(const simd16vector &v)
235 {
236 return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
237 }
238
239 INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
240 {
241 switch (lane)
242 {
243 case 0:
244 return swizzleLane0(v);
245 case 1:
246 return swizzleLane1(v);
247 case 2:
248 return swizzleLane2(v);
249 case 3:
250 return swizzleLane3(v);
251 case 4:
252 return swizzleLane4(v);
253 case 5:
254 return swizzleLane5(v);
255 case 6:
256 return swizzleLane6(v);
257 case 7:
258 return swizzleLane7(v);
259 case 8:
260 return swizzleLane8(v);
261 case 9:
262 return swizzleLane9(v);
263 case 10:
264 return swizzleLaneA(v);
265 case 11:
266 return swizzleLaneB(v);
267 case 12:
268 return swizzleLaneC(v);
269 case 13:
270 return swizzleLaneD(v);
271 case 14:
272 return swizzleLaneE(v);
273 case 15:
274 return swizzleLaneF(v);
275 default:
276 return _mm_setzero_ps();
277 }
278 }
279
280 #endif
281 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
282 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
283 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
284 #if ENABLE_AVX512_SIMD16
285 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
286 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
287 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
288 #endif
289 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
290
291 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
292 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
293 #if ENABLE_AVX512_SIMD16
294 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
295 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
296 #endif
297 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
298
299 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
300 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
301 #if ENABLE_AVX512_SIMD16
302 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
303 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
304 #endif
305 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
306
307 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
308 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
309 #if ENABLE_AVX512_SIMD16
310 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
311 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
312 #endif
313 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
314
315 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
316 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
317 #if ENABLE_AVX512_SIMD16
318 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
319 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
320 #endif
321 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
322
323 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
324 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
325 #if ENABLE_AVX512_SIMD16
326 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
327 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
328 #endif
329 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
330
331 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
332 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
333 #if ENABLE_AVX512_SIMD16
334 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
335 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
336 #endif
337 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
338
339 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
340 #if ENABLE_AVX512_SIMD16
341 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
342 #endif
343 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
344
345 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
346 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
347 bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
348 #if ENABLE_AVX512_SIMD16
349 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
350 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
351 bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
352 #endif
353 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
354
355 template <uint32_t TotalControlPoints>
356 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
357 {
358 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
359 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
360 // Each attribute has 4 components.
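// Illustration of the addressing (non-SIMD16 path, KNOB_SIMD_WIDTH == 8): with
// TotalControlPoints == 4, primIndex == 2 and cp == 1, input_cp == 9, so the
// component is read from banked simdvector 1, lane 1.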
361
362 /// @todo Optimize this
363
364 #if USE_SIMD16_FRONTEND
365 if (pa.useAlternateOffset)
366 {
367 primIndex += KNOB_SIMD_WIDTH;
368 }
369
370 #endif
371 float* pOutVec = (float*)verts;
372
373 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
374 {
375 uint32_t input_cp = primIndex * TotalControlPoints + cp;
376 #if USE_SIMD16_FRONTEND
377 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
378 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
379
380 #else
381 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
382 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
383
384 #endif
385 // Loop over all components of the attribute
386 for (uint32_t i = 0; i < 4; ++i)
387 {
388 #if USE_SIMD16_FRONTEND
389 const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
390 #else
391 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
392 #endif
393 pOutVec[cp * 4 + i] = pInputVec[input_lane];
394 }
395 }
396 }
397
398 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
399 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
400 {
401 SetNextPaState(
402 pa,
403 PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
404 PaPatchListSingle<TotalControlPoints>);
405
406 return false;
407 }
408
409 template<uint32_t TotalControlPoints>
410 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
411 {
412 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
413 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
414 // Each attribute has 4 components.
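// The gather below builds each control point across all output lanes: lane L of
// control point cp reads input_cp = L * TotalControlPoints + cp (biased by the
// alternate offset on the SIMD16 path). E.g. lane 5, cp 2, TotalControlPoints == 3
// gives input_cp == 17 -> simdvector 2, lane 1 with an 8-wide SIMD.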
415
416 /// @todo Optimize this
417
418 #if USE_SIMD16_FRONTEND
419 uint32_t lane_offset = 0;
420
421 if (pa.useAlternateOffset)
422 {
423 lane_offset = KNOB_SIMD_WIDTH;
424 }
425
426 #endif
427 // Loop over all components of the attribute
428 for (uint32_t i = 0; i < 4; ++i)
429 {
430 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
431 {
432 float vec[KNOB_SIMD_WIDTH];
433 for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
434 {
435 #if USE_SIMD16_FRONTEND
436 uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp;
437 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
438 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
439
440 const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
441 #else
442 uint32_t input_cp = lane * TotalControlPoints + cp;
443 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
444 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
445
446 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
447 #endif
448 vec[lane] = pInputVec[input_lane];
449 }
450 verts[cp][i] = _simd_loadu_ps(vec);
451 }
452 }
453
454 SetNextPaState(
455 pa,
456 PaPatchList<TotalControlPoints>,
457 PaPatchListSingle<TotalControlPoints>,
458 0,
459 PA_STATE_OPT::SIMD_WIDTH,
460 true);
461
462 return true;
463 }
464
465 #if ENABLE_AVX512_SIMD16
466 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
467 static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
468 {
469 SetNextPaState_simd16(
470 pa,
471 PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
472 PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
473 PaPatchListSingle<TotalControlPoints>);
474
475 return false;
476 }
477
478 template<uint32_t TotalControlPoints>
479 static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
480 {
481 // We have an input of KNOB_SIMD16_WIDTH * TotalControlPoints and we output
482 // KNOB_SIMD16_WIDTH * 1 patch. This function is called once per attribute.
483 // Each attribute has 4 components.
484
485 /// @todo Optimize this
486
487 // Loop over all components of the attribute
488 for (uint32_t i = 0; i < 4; ++i)
489 {
490 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
491 {
492 float vec[KNOB_SIMD16_WIDTH];
493 for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane)
494 {
495 uint32_t input_cp = lane * TotalControlPoints + cp;
496 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
497 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
498
499 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
500 vec[lane] = pInputVec[input_lane];
501 }
502 verts[cp][i] = _simd16_loadu_ps(vec);
503 }
504 }
505
506 SetNextPaState_simd16(
507 pa,
508 PaPatchList_simd16<TotalControlPoints>,
509 PaPatchList<TotalControlPoints>,
510 PaPatchListSingle<TotalControlPoints>,
511 0,
512 PA_STATE_OPT::SIMD_WIDTH,
513 true);
514
515 return true;
516 }
517
518 #endif
519 #define PA_PATCH_LIST_TERMINATOR(N) \
520 template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
521 { return PaPatchListTerm<N>(pa, slot, verts); }
522 PA_PATCH_LIST_TERMINATOR(1)
523 PA_PATCH_LIST_TERMINATOR(2)
524 PA_PATCH_LIST_TERMINATOR(3)
525 PA_PATCH_LIST_TERMINATOR(4)
526 PA_PATCH_LIST_TERMINATOR(5)
527 PA_PATCH_LIST_TERMINATOR(6)
528 PA_PATCH_LIST_TERMINATOR(7)
529 PA_PATCH_LIST_TERMINATOR(8)
530 PA_PATCH_LIST_TERMINATOR(9)
531 PA_PATCH_LIST_TERMINATOR(10)
532 PA_PATCH_LIST_TERMINATOR(11)
533 PA_PATCH_LIST_TERMINATOR(12)
534 PA_PATCH_LIST_TERMINATOR(13)
535 PA_PATCH_LIST_TERMINATOR(14)
536 PA_PATCH_LIST_TERMINATOR(15)
537 PA_PATCH_LIST_TERMINATOR(16)
538 PA_PATCH_LIST_TERMINATOR(17)
539 PA_PATCH_LIST_TERMINATOR(18)
540 PA_PATCH_LIST_TERMINATOR(19)
541 PA_PATCH_LIST_TERMINATOR(20)
542 PA_PATCH_LIST_TERMINATOR(21)
543 PA_PATCH_LIST_TERMINATOR(22)
544 PA_PATCH_LIST_TERMINATOR(23)
545 PA_PATCH_LIST_TERMINATOR(24)
546 PA_PATCH_LIST_TERMINATOR(25)
547 PA_PATCH_LIST_TERMINATOR(26)
548 PA_PATCH_LIST_TERMINATOR(27)
549 PA_PATCH_LIST_TERMINATOR(28)
550 PA_PATCH_LIST_TERMINATOR(29)
551 PA_PATCH_LIST_TERMINATOR(30)
552 PA_PATCH_LIST_TERMINATOR(31)
553 PA_PATCH_LIST_TERMINATOR(32)
554 #undef PA_PATCH_LIST_TERMINATOR
555
556 #if ENABLE_AVX512_SIMD16
557 #define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \
558 template<> bool PaPatchList_simd16<N, N>(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])\
559 { return PaPatchListTerm_simd16<N>(pa, slot, verts); }
560 PA_PATCH_LIST_TERMINATOR_SIMD16(1)
561 PA_PATCH_LIST_TERMINATOR_SIMD16(2)
562 PA_PATCH_LIST_TERMINATOR_SIMD16(3)
563 PA_PATCH_LIST_TERMINATOR_SIMD16(4)
564 PA_PATCH_LIST_TERMINATOR_SIMD16(5)
565 PA_PATCH_LIST_TERMINATOR_SIMD16(6)
566 PA_PATCH_LIST_TERMINATOR_SIMD16(7)
567 PA_PATCH_LIST_TERMINATOR_SIMD16(8)
568 PA_PATCH_LIST_TERMINATOR_SIMD16(9)
569 PA_PATCH_LIST_TERMINATOR_SIMD16(10)
570 PA_PATCH_LIST_TERMINATOR_SIMD16(11)
571 PA_PATCH_LIST_TERMINATOR_SIMD16(12)
572 PA_PATCH_LIST_TERMINATOR_SIMD16(13)
573 PA_PATCH_LIST_TERMINATOR_SIMD16(14)
574 PA_PATCH_LIST_TERMINATOR_SIMD16(15)
575 PA_PATCH_LIST_TERMINATOR_SIMD16(16)
576 PA_PATCH_LIST_TERMINATOR_SIMD16(17)
577 PA_PATCH_LIST_TERMINATOR_SIMD16(18)
578 PA_PATCH_LIST_TERMINATOR_SIMD16(19)
579 PA_PATCH_LIST_TERMINATOR_SIMD16(20)
580 PA_PATCH_LIST_TERMINATOR_SIMD16(21)
581 PA_PATCH_LIST_TERMINATOR_SIMD16(22)
582 PA_PATCH_LIST_TERMINATOR_SIMD16(23)
583 PA_PATCH_LIST_TERMINATOR_SIMD16(24)
584 PA_PATCH_LIST_TERMINATOR_SIMD16(25)
585 PA_PATCH_LIST_TERMINATOR_SIMD16(26)
586 PA_PATCH_LIST_TERMINATOR_SIMD16(27)
587 PA_PATCH_LIST_TERMINATOR_SIMD16(28)
588 PA_PATCH_LIST_TERMINATOR_SIMD16(29)
589 PA_PATCH_LIST_TERMINATOR_SIMD16(30)
590 PA_PATCH_LIST_TERMINATOR_SIMD16(31)
591 PA_PATCH_LIST_TERMINATOR_SIMD16(32)
592 #undef PA_PATCH_LIST_TERMINATOR_SIMD16
593
594 #endif
595 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
596 {
597 SetNextPaState(pa, PaTriList1, PaTriListSingle0);
598 return false; // Not enough vertices to assemble 4 or 8 triangles.
599 }
600
601 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
602 {
603 SetNextPaState(pa, PaTriList2, PaTriListSingle0);
604 return false; // Not enough vertices to assemble 8 triangles.
605 }
606
607 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
608 {
609 #if KNOB_ARCH == KNOB_ARCH_AVX
610 #if USE_SIMD16_FRONTEND
611 simdvector a;
612 simdvector b;
613 simdvector c;
614
615 if (!pa.useAlternateOffset)
616 {
617 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
618 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
619
620 for (uint32_t i = 0; i < 4; i += 1)
621 {
622 a[i] = _simd16_extract_ps(a_16[i], 0);
623 b[i] = _simd16_extract_ps(a_16[i], 1);
624 c[i] = _simd16_extract_ps(b_16[i], 0);
625 }
626 }
627 else
628 {
629 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
630 const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
631
632 for (uint32_t i = 0; i < 4; i += 1)
633 {
634 a[i] = _simd16_extract_ps(b_16[i], 1);
635 b[i] = _simd16_extract_ps(c_16[i], 0);
636 c[i] = _simd16_extract_ps(c_16[i], 1);
637 }
638 }
639
640 #else
641 simdvector &a = PaGetSimdVector(pa, 0, slot);
642 simdvector &b = PaGetSimdVector(pa, 1, slot);
643 simdvector &c = PaGetSimdVector(pa, 2, slot);
644
645 #endif
646 simdscalar s;
647
648 // Tri Pattern - provoking vertex is always v0
649 // v0 -> 0 3 6 9 12 15 18 21
650 // v1 -> 1 4 7 10 13 16 19 22
651 // v2 -> 2 5 8 11 14 17 20 23
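// The blend masks pick which source supplies each lane before the permutes,
// e.g. for v0: 0x92 (0b10010010) takes b at lanes 1/4/7 and 0x24 (0b00100100)
// takes c at lanes 2/5, leaving a at lanes 0/3/6.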
652
653 for (int i = 0; i < 4; ++i)
654 {
655 simdvector& v0 = verts[0];
656 v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
657 v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
658 v0[i] = _mm256_permute_ps(v0[i], 0x6C);
659 s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
660 v0[i] = _simd_blend_ps(v0[i], s, 0x44);
661
662 simdvector& v1 = verts[1];
663 v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
664 v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
665 v1[i] = _mm256_permute_ps(v1[i], 0xB1);
666 s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
667 v1[i] = _simd_blend_ps(v1[i], s, 0x66);
668
669 simdvector& v2 = verts[2];
670 v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
671 v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
672 v2[i] = _mm256_permute_ps(v2[i], 0xC6);
673 s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
674 v2[i] = _simd_blend_ps(v2[i], s, 0x22);
675 }
676
677 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
678 const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
679 const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
680 const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
681
682 #if USE_SIMD16_FRONTEND
683 simdvector a;
684 simdvector b;
685 simdvector c;
686
687 if (!pa.useAlternateOffset)
688 {
689 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
690 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
691
692 for (uint32_t i = 0; i < 4; i += 1)
693 {
694 a[i] = _simd16_extract_ps(a_16[i], 0);
695 b[i] = _simd16_extract_ps(a_16[i], 1);
696 c[i] = _simd16_extract_ps(b_16[i], 0);
697 }
698 }
699 else
700 {
701 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
702 const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
703
704 for (uint32_t i = 0; i < 4; i += 1)
705 {
706 a[i] = _simd16_extract_ps(b_16[i], 1);
707 b[i] = _simd16_extract_ps(c_16[i], 0);
708 c[i] = _simd16_extract_ps(c_16[i], 1);
709 }
710 }
711
712 #else
713 const simdvector &a = PaGetSimdVector(pa, 0, slot);
714 const simdvector &b = PaGetSimdVector(pa, 1, slot);
715 const simdvector &c = PaGetSimdVector(pa, 2, slot);
716
717 #endif
718 // v0 -> a0 a3 a6 b1 b4 b7 c2 c5
719 // v1 -> a1 a4 a7 b2 b5 c0 c3 c6
720 // v2 -> a2 a5 b0 b3 b6 c1 c4 c7
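// Each temp below gathers one output vertex stream element-wise (e.g. temp0 is
// a0 b1 c2 a3 b4 c5 a6 b7), and the cross-lane permute then packs it into the
// order shown above (perm0 reads lanes 0 3 6 1 4 7 2 5 -> a0 a3 a6 b1 b4 b7 c2 c5).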
721
722 simdvector &v0 = verts[0];
723 simdvector &v1 = verts[1];
724 simdvector &v2 = verts[2];
725
726 // for simd x, y, z, and w
727 for (int i = 0; i < 4; ++i)
728 {
729 simdscalar temp0 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
730 simdscalar temp1 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
731 simdscalar temp2 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
732
733 v0[i] = _simd_permute_ps(temp0, perm0);
734 v1[i] = _simd_permute_ps(temp1, perm1);
735 v2[i] = _simd_permute_ps(temp2, perm2);
736 }
737
738 #endif
739 SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
740 return true;
741 }
742
743 #if ENABLE_AVX512_SIMD16
744 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
745 {
746 SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
747 return false; // Not enough vertices to assemble 16 triangles
748 }
749
750 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
751 {
752 SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
753 return false; // Not enough vertices to assemble 16 triangles
754 }
755
756 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
757 {
758 const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0);
759 const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1);
760 const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2);
761
762 const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
763 const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
764 const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
765
766 simd16vector &v0 = verts[0];
767 simd16vector &v1 = verts[1];
768 simd16vector &v2 = verts[2];
769
770 // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
771 // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
772 // v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
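// Same gather-then-permute scheme as the 8-wide AVX2 path: the two blends pick,
// lane by lane, whichever of a/b/c owns that vertex, and the 16-wide permute
// packs the result into the order shown above.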
773
774 // for simd16 x, y, z, and w
775 for (int i = 0; i < 4; i += 1)
776 {
777 simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x4924), c[i], 0x2492);
778 simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x9249), c[i], 0x4924);
779 simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x2492), c[i], 0x9249);
780
781 v0[i] = _simd16_permute_ps(temp0, perm0);
782 v1[i] = _simd16_permute_ps(temp1, perm1);
783 v2[i] = _simd16_permute_ps(temp2, perm2);
784 }
785
786 SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
787 return true;
788 }
789
790 #endif
791 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
792 {
793 #if USE_SIMD16_FRONTEND
794 const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
795 const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
796 const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
797
798 if (pa.useAlternateOffset)
799 {
800 primIndex += KNOB_SIMD_WIDTH;
801 }
802
803 // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
804 // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
805 // v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
806
807 switch (primIndex)
808 {
809 case 0:
810 verts[0] = swizzleLane0(a);
811 verts[1] = swizzleLane1(a);
812 verts[2] = swizzleLane2(a);
813 break;
814 case 1:
815 verts[0] = swizzleLane3(a);
816 verts[1] = swizzleLane4(a);
817 verts[2] = swizzleLane5(a);
818 break;
819 case 2:
820 verts[0] = swizzleLane6(a);
821 verts[1] = swizzleLane7(a);
822 verts[2] = swizzleLane8(a);
823 break;
824 case 3:
825 verts[0] = swizzleLane9(a);
826 verts[1] = swizzleLaneA(a);
827 verts[2] = swizzleLaneB(a);
828 break;
829 case 4:
830 verts[0] = swizzleLaneC(a);
831 verts[1] = swizzleLaneD(a);
832 verts[2] = swizzleLaneE(a);
833 break;
834 case 5:
835 verts[0] = swizzleLaneF(a);
836 verts[1] = swizzleLane0(b);
837 verts[2] = swizzleLane1(b);
838 break;
839 case 6:
840 verts[0] = swizzleLane2(b);
841 verts[1] = swizzleLane3(b);
842 verts[2] = swizzleLane4(b);
843 break;
844 case 7:
845 verts[0] = swizzleLane5(b);
846 verts[1] = swizzleLane6(b);
847 verts[2] = swizzleLane7(b);
848 break;
849 case 8:
850 verts[0] = swizzleLane8(b);
851 verts[1] = swizzleLane9(b);
852 verts[2] = swizzleLaneA(b);
853 break;
854 case 9:
855 verts[0] = swizzleLaneB(b);
856 verts[1] = swizzleLaneC(b);
857 verts[2] = swizzleLaneD(b);
858 break;
859 case 10:
860 verts[0] = swizzleLaneE(b);
861 verts[1] = swizzleLaneF(b);
862 verts[2] = swizzleLane0(c);
863 break;
864 case 11:
865 verts[0] = swizzleLane1(c);
866 verts[1] = swizzleLane2(c);
867 verts[2] = swizzleLane3(c);
868 break;
869 case 12:
870 verts[0] = swizzleLane4(c);
871 verts[1] = swizzleLane5(c);
872 verts[2] = swizzleLane6(c);
873 break;
874 case 13:
875 verts[0] = swizzleLane7(c);
876 verts[1] = swizzleLane8(c);
877 verts[2] = swizzleLane9(c);
878 break;
879 case 14:
880 verts[0] = swizzleLaneA(c);
881 verts[1] = swizzleLaneB(c);
882 verts[2] = swizzleLaneC(c);
883 break;
884 case 15:
885 verts[0] = swizzleLaneD(c);
886 verts[1] = swizzleLaneE(c);
887 verts[2] = swizzleLaneF(c);
888 break;
889 };
890 #else
891 // We have 12 simdscalars contained within 3 simdvectors which
892 // hold at least 8 triangles worth of data. We want to assemble a single
893 // triangle with data in horizontal form.
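// For example, primIndex 2 covers global vertices 6 7 8, i.e. lanes 6 and 7
// of a plus lane 0 of b (case 2 below).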
894
895 const simdvector &a = PaGetSimdVector(pa, 0, slot);
896 const simdvector &b = PaGetSimdVector(pa, 1, slot);
897 const simdvector &c = PaGetSimdVector(pa, 2, slot);
898
899 // Convert from vertical to horizontal.
900 // Tri Pattern - provoking vertex is always v0
901 // v0 -> 0 3 6 9 12 15 18 21
902 // v1 -> 1 4 7 10 13 16 19 22
903 // v2 -> 2 5 8 11 14 17 20 23
904
905 switch (primIndex)
906 {
907 case 0:
908 verts[0] = swizzleLane0(a);
909 verts[1] = swizzleLane1(a);
910 verts[2] = swizzleLane2(a);
911 break;
912 case 1:
913 verts[0] = swizzleLane3(a);
914 verts[1] = swizzleLane4(a);
915 verts[2] = swizzleLane5(a);
916 break;
917 case 2:
918 verts[0] = swizzleLane6(a);
919 verts[1] = swizzleLane7(a);
920 verts[2] = swizzleLane0(b);
921 break;
922 case 3:
923 verts[0] = swizzleLane1(b);
924 verts[1] = swizzleLane2(b);
925 verts[2] = swizzleLane3(b);
926 break;
927 case 4:
928 verts[0] = swizzleLane4(b);
929 verts[1] = swizzleLane5(b);
930 verts[2] = swizzleLane6(b);
931 break;
932 case 5:
933 verts[0] = swizzleLane7(b);
934 verts[1] = swizzleLane0(c);
935 verts[2] = swizzleLane1(c);
936 break;
937 case 6:
938 verts[0] = swizzleLane2(c);
939 verts[1] = swizzleLane3(c);
940 verts[2] = swizzleLane4(c);
941 break;
942 case 7:
943 verts[0] = swizzleLane5(c);
944 verts[1] = swizzleLane6(c);
945 verts[2] = swizzleLane7(c);
946 break;
947 };
948 #endif
949 }
950
951 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
952 {
953 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
954 return false; // Not enough vertices to assemble 8 triangles.
955 }
956
957 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
958 {
959 #if USE_SIMD16_FRONTEND
960 simdvector a;
961 simdvector b;
962
963 if (!pa.useAlternateOffset)
964 {
965 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
966
967 for (uint32_t i = 0; i < 4; i += 1)
968 {
969 a[i] = _simd16_extract_ps(a_16[i], 0);
970 b[i] = _simd16_extract_ps(a_16[i], 1);
971 }
972 }
973 else
974 {
975 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
976
977 for (uint32_t i = 0; i < 4; i += 1)
978 {
979 a[i] = _simd16_extract_ps(b_16[i], 0);
980 b[i] = _simd16_extract_ps(b_16[i], 1);
981 }
982 }
983
984 #else
985 simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
986 simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
987
988 #endif
989 simdscalar s;
990
991 for(int i = 0; i < 4; ++i)
992 {
993 simdscalar a0 = a[i];
994 simdscalar b0 = b[i];
995
996 // Tri Pattern - provoking vertex is always v0
997 // v0 -> 0 1 2 3 4 5 6 7
998 // v1 -> 1 3 3 5 5 7 7 9
999 // v2 -> 2 2 4 4 6 6 8 8
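// Odd-numbered strip triangles use (n, n+2, n+1) instead of (n, n+1, n+2)
// so the winding order stays consistent across the strip.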
1000 simdvector& v0 = verts[0];
1001 v0[i] = a0;
1002
1003 // s -> 4 5 6 7 8 9 10 11
1004 s = _simd_permute2f128_ps(a0, b0, 0x21);
1005 // s -> 2 3 4 5 6 7 8 9
1006 s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
1007
1008 simdvector& v1 = verts[1];
1009 // v1 -> 1 3 3 5 5 7 7 9
1010 v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
1011
1012 simdvector& v2 = verts[2];
1013 // v2 -> 2 2 4 4 6 6 8 8
1014 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
1015 }
1016
1017 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1018 return true;
1019 }
1020
1021 #if ENABLE_AVX512_SIMD16
1022 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1023 {
1024 SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
1025 return false; // Not enough vertices to assemble 16 triangles.
1026 }
1027
1028 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1029 {
1030 const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
1031 const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
1032
1033 simd16vector &v0 = verts[0];
1034 simd16vector &v1 = verts[1];
1035 simd16vector &v2 = verts[2];
1036
1037 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1038 // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1039 // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1040
1041 // for simd16 x, y, z, and w
1042 for (int i = 0; i < 4; i += 1)
1043 {
1044 simd16scalar perm0 = _simd16_permute2f128_ps(a[i], a[i], 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
1045 simd16scalar perm1 = _simd16_permute2f128_ps(b[i], b[i], 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1046
1047 simd16scalar blend = _simd16_blend_ps(perm0, perm1, 0xF000); // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
1048 simd16scalar shuff = _simd16_shuffle_ps(a[i], blend, _MM_SHUFFLE(1, 0, 3, 2)); // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
1049
1050 v0[i] = a[i]; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1051 v1[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1052 v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1053 }
1054
1055 SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1056 return true;
1057 }
1058
1059 #endif
1060 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1061 {
1062 #if USE_SIMD16_FRONTEND
1063 const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
1064 const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
1065
1066 if (pa.useAlternateOffset)
1067 {
1068 primIndex += KNOB_SIMD_WIDTH;
1069 }
1070
1071 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1072 // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1073 // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1074
1075 switch (primIndex)
1076 {
1077 case 0:
1078 verts[0] = swizzleLane0(a);
1079 verts[1] = swizzleLane1(a);
1080 verts[2] = swizzleLane2(a);
1081 break;
1082 case 1:
1083 verts[0] = swizzleLane1(a);
1084 verts[1] = swizzleLane3(a);
1085 verts[2] = swizzleLane2(a);
1086 break;
1087 case 2:
1088 verts[0] = swizzleLane2(a);
1089 verts[1] = swizzleLane3(a);
1090 verts[2] = swizzleLane4(a);
1091 break;
1092 case 3:
1093 verts[0] = swizzleLane3(a);
1094 verts[1] = swizzleLane5(a);
1095 verts[2] = swizzleLane4(a);
1096 break;
1097 case 4:
1098 verts[0] = swizzleLane4(a);
1099 verts[1] = swizzleLane5(a);
1100 verts[2] = swizzleLane6(a);
1101 break;
1102 case 5:
1103 verts[0] = swizzleLane5(a);
1104 verts[1] = swizzleLane7(a);
1105 verts[2] = swizzleLane6(a);
1106 break;
1107 case 6:
1108 verts[0] = swizzleLane6(a);
1109 verts[1] = swizzleLane7(a);
1110 verts[2] = swizzleLane8(a);
1111 break;
1112 case 7:
1113 verts[0] = swizzleLane7(a);
1114 verts[1] = swizzleLane9(a);
1115 verts[2] = swizzleLane8(a);
1116 break;
1117 case 8:
1118 verts[0] = swizzleLane8(a);
1119 verts[1] = swizzleLane9(a);
1120 verts[2] = swizzleLaneA(a);
1121 break;
1122 case 9:
1123 verts[0] = swizzleLane9(a);
1124 verts[1] = swizzleLaneB(a);
1125 verts[2] = swizzleLaneA(a);
1126 break;
1127 case 10:
1128 verts[0] = swizzleLaneA(a);
1129 verts[1] = swizzleLaneB(a);
1130 verts[2] = swizzleLaneC(a);
1131 break;
1132 case 11:
1133 verts[0] = swizzleLaneB(a);
1134 verts[1] = swizzleLaneD(a);
1135 verts[2] = swizzleLaneC(a);
1136 break;
1137 case 12:
1138 verts[0] = swizzleLaneC(a);
1139 verts[1] = swizzleLaneD(a);
1140 verts[2] = swizzleLaneE(a);
1141 break;
1142 case 13:
1143 verts[0] = swizzleLaneD(a);
1144 verts[1] = swizzleLaneF(a);
1145 verts[2] = swizzleLaneE(a);
1146 break;
1147 case 14:
1148 verts[0] = swizzleLaneE(a);
1149 verts[1] = swizzleLaneF(a);
1150 verts[2] = swizzleLane0(b);
1151 break;
1152 case 15:
1153 verts[0] = swizzleLaneF(a);
1154 verts[1] = swizzleLane1(b);
1155 verts[2] = swizzleLane0(b);
1156 break;
1157 };
1158 #else
1159 const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
1160 const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
1161
1162 // Convert from vertical to horizontal.
1163 // Tri Pattern - provoking vertex is always v0
1164 // v0 -> 0 1 2 3 4 5 6 7
1165 // v1 -> 1 3 3 5 5 7 7 9
1166 // v2 -> 2 2 4 4 6 6 8 8
1167
1168 switch (primIndex)
1169 {
1170 case 0:
1171 verts[0] = swizzleLane0(a);
1172 verts[1] = swizzleLane1(a);
1173 verts[2] = swizzleLane2(a);
1174 break;
1175 case 1:
1176 verts[0] = swizzleLane1(a);
1177 verts[1] = swizzleLane3(a);
1178 verts[2] = swizzleLane2(a);
1179 break;
1180 case 2:
1181 verts[0] = swizzleLane2(a);
1182 verts[1] = swizzleLane3(a);
1183 verts[2] = swizzleLane4(a);
1184 break;
1185 case 3:
1186 verts[0] = swizzleLane3(a);
1187 verts[1] = swizzleLane5(a);
1188 verts[2] = swizzleLane4(a);
1189 break;
1190 case 4:
1191 verts[0] = swizzleLane4(a);
1192 verts[1] = swizzleLane5(a);
1193 verts[2] = swizzleLane6(a);
1194 break;
1195 case 5:
1196 verts[0] = swizzleLane5(a);
1197 verts[1] = swizzleLane7(a);
1198 verts[2] = swizzleLane6(a);
1199 break;
1200 case 6:
1201 verts[0] = swizzleLane6(a);
1202 verts[1] = swizzleLane7(a);
1203 verts[2] = swizzleLane0(b);
1204 break;
1205 case 7:
1206 verts[0] = swizzleLane7(a);
1207 verts[1] = swizzleLane1(b);
1208 verts[2] = swizzleLane0(b);
1209 break;
1210 };
1211 #endif
1212 }
1213
1214 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1215 {
1216 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
1217 return false; // Not enough vertices to assemble 8 triangles.
1218 }
1219
1220 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1221 {
1222 #if USE_SIMD16_FRONTEND
1223 simdvector leadVert;
1224 simdvector a;
1225 simdvector b;
1226
1227 const simd16vector &leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
1228
1229 if (!pa.useAlternateOffset)
1230 {
1231 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
1232
1233 for (uint32_t i = 0; i < 4; i += 1)
1234 {
1235 leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
1236
1237 a[i] = _simd16_extract_ps(a_16[i], 0);
1238 b[i] = _simd16_extract_ps(a_16[i], 1);
1239 }
1240 }
1241 else
1242 {
1243 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
1244
1245 for (uint32_t i = 0; i < 4; i += 1)
1246 {
1247 leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
1248
1249 a[i] = _simd16_extract_ps(b_16[i], 0);
1250 b[i] = _simd16_extract_ps(b_16[i], 1);
1251 }
1252 }
1253
1254 #else
1255 const simdvector &leadVert = PaGetSimdVector(pa, pa.first, slot);
1256 const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
1257 const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
1258
1259 #endif
1260 simdscalar s;
1261
1262 // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
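// verts[0] below is a broadcast of lane 0 of the leading (anchor) vertex:
// the shuffle picks element 0 in each 128-bit half and the permute2f128 with
// control 0x00 then replicates the low half across the register.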
1263 for(int i = 0; i < 4; ++i)
1264 {
1265 simdscalar a0 = a[i];
1266 simdscalar b0 = b[i];
1267
1268 simdscalar comp = leadVert[i];
1269
1270 simdvector& v0 = verts[0];
1271 v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
1272 v0[i] = _simd_permute2f128_ps(v0[i], comp, 0x00);
1273
1274 simdvector& v2 = verts[2];
1275 s = _simd_permute2f128_ps(a0, b0, 0x21);
1276 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
1277
1278 simdvector& v1 = verts[1];
1279 v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
1280 }
1281
1282 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1283 return true;
1284 }
1285
1286 #if ENABLE_AVX512_SIMD16
1287 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1288 {
1289 SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
1290 return false; // Not enough vertices to assemble 16 triangles.
1291 }
1292
1293 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1294 {
1295 const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
1296 const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
1297 const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
1298
1299 simd16vector &v0 = verts[0];
1300 simd16vector &v1 = verts[1];
1301 simd16vector &v2 = verts[2];
1302
1303 // v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1304 // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1305 // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1306
1307 // for simd16 x, y, z, and w
1308 for (uint32_t i = 0; i < 4; i += 1)
1309 {
1310 simd16scalar shuff = _simd16_shuffle_ps(a[i], a[i], _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4
1311
1312 v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00); // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1313
1314 simd16scalar temp0 = _simd16_permute2f128_ps(b[i], b[i], 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1315 simd16scalar temp1 = _simd16_permute2f128_ps(c[i], c[i], 0x39); // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
1316
1317 simd16scalar blend = _simd16_blend_ps(temp0, temp1, 0xF000); // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
1318
1319 v2[i] = _simd16_shuffle_ps(b[i], blend, _MM_SHUFFLE(1, 0, 3, 2)); // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1320 v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1321 }
1322
1323 SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1324 return true;
1325 }
1326
1327 #endif
1328 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1329 {
1330 #if USE_SIMD16_FRONTEND
1331 const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
1332 const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
1333 const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
1334
1335 if (pa.useAlternateOffset)
1336 {
1337 primIndex += KNOB_SIMD_WIDTH;
1338 }
1339
1340 // v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1341 // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1342 // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1343
1344 // vert 0 from leading vertex
1345 verts[0] = swizzleLane0(a);
1346
1347 // vert 1
1348 if (primIndex < 15)
1349 {
1350 verts[1] = swizzleLaneN(b, primIndex + 1);
1351 }
1352 else
1353 {
1354 verts[1] = swizzleLane0(c);
1355 }
1356
1357 // vert 2
1358 if (primIndex < 14)
1359 {
1360 verts[2] = swizzleLaneN(b, primIndex + 2);
1361 }
1362 else
1363 {
1364 verts[2] = swizzleLaneN(c, primIndex - 14);
1365 }
1366 #else
1367 const simdvector &a = PaGetSimdVector(pa, pa.first, slot);
1368 const simdvector &b = PaGetSimdVector(pa, pa.prev, slot);
1369 const simdvector &c = PaGetSimdVector(pa, pa.cur, slot);
1370
1371 // vert 0 from leading vertex
1372 verts[0] = swizzleLane0(a);
1373
1374 // vert 1
1375 if (primIndex < 7)
1376 {
1377 verts[1] = swizzleLaneN(b, primIndex + 1);
1378 }
1379 else
1380 {
1381 verts[1] = swizzleLane0(c);
1382 }
1383
1384 // vert 2
1385 if (primIndex < 6)
1386 {
1387 verts[2] = swizzleLaneN(b, primIndex + 2);
1388 }
1389 else
1390 {
1391 verts[2] = swizzleLaneN(c, primIndex - 6);
1392 }
1393 #endif
1394 }
1395
1396 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1397 {
1398 SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
1399 return false; // Not enough vertices to assemble 8 triangles.
1400 }
1401
1402 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1403 {
1404 #if USE_SIMD16_FRONTEND
1405 simdvector a;
1406 simdvector b;
1407
1408 if (!pa.useAlternateOffset)
1409 {
1410 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1411
1412 for (uint32_t i = 0; i < 4; i += 1)
1413 {
1414 a[i] = _simd16_extract_ps(a_16[i], 0);
1415 b[i] = _simd16_extract_ps(a_16[i], 1);
1416 }
1417 }
1418 else
1419 {
1420 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1421
1422 for (uint32_t i = 0; i < 4; i += 1)
1423 {
1424 a[i] = _simd16_extract_ps(b_16[i], 0);
1425 b[i] = _simd16_extract_ps(b_16[i], 1);
1426 }
1427 }
1428
1429 #else
1430 simdvector &a = PaGetSimdVector(pa, 0, slot);
1431 simdvector &b = PaGetSimdVector(pa, 1, slot);
1432
1433 #endif
1434 simdscalar s1, s2;
1435
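// Each quad contributes two triangles that share its first vertex:
// v0 -> a0 a0 a4 a4 b0 b0 b4 b4
// v1 -> a1 a2 a5 a6 b1 b2 b5 b6
// v2 -> a2 a3 a6 a7 b2 b3 b6 b7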
1436 for(int i = 0; i < 4; ++i)
1437 {
1438 simdscalar a0 = a[i];
1439 simdscalar b0 = b[i];
1440
1441 s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
1442 s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
1443
1444 simdvector& v0 = verts[0];
1445 v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
1446
1447 simdvector& v1 = verts[1];
1448 v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
1449
1450 simdvector& v2 = verts[2];
1451 v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
1452 }
1453
1454 SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1455 return true;
1456 }
1457
1458 #if ENABLE_AVX512_SIMD16
1459 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1460 {
1461 SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
1462 return false; // Not enough vertices to assemble 16 triangles.
1463 }
1464
1465 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1466 {
1467 const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
1468 const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
1469
1470 simd16vector &v0 = verts[0];
1471 simd16vector &v1 = verts[1];
1472 simd16vector &v2 = verts[2];
1473
1474 // v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
1475 // v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
1476 // v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1477
1478 // for simd16 x, y, z, and w
1479 for (uint32_t i = 0; i < 4; i += 1)
1480 {
1481 simd16scalar temp0 = _simd16_permute2f128_ps(a[i], b[i], 0x88); // (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
1482 simd16scalar temp1 = _simd16_permute2f128_ps(a[i], b[i], 0xDD); // (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1483
1484 v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
1485 v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1)); // a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
1486 v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1487 }
1488
1489 SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1490 return true;
1491 }
1492
1493 #endif
1494 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1495 {
1496 #if USE_SIMD16_FRONTEND
1497 const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
1498 const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
1499
1500 if (pa.useAlternateOffset)
1501 {
1502 primIndex += KNOB_SIMD_WIDTH;
1503 }
1504
1505 switch (primIndex)
1506 {
1507 case 0:
1508 // triangle 0 - 0 1 2
1509 verts[0] = swizzleLane0(a);
1510 verts[1] = swizzleLane1(a);
1511 verts[2] = swizzleLane2(a);
1512 break;
1513 case 1:
1514 // triangle 1 - 0 2 3
1515 verts[0] = swizzleLane0(a);
1516 verts[1] = swizzleLane2(a);
1517 verts[2] = swizzleLane3(a);
1518 break;
1519 case 2:
1520 // triangle 2 - 4 5 6
1521 verts[0] = swizzleLane4(a);
1522 verts[1] = swizzleLane5(a);
1523 verts[2] = swizzleLane6(a);
1524 break;
1525 case 3:
1526 // triangle 3 - 4 6 7
1527 verts[0] = swizzleLane4(a);
1528 verts[1] = swizzleLane6(a);
1529 verts[2] = swizzleLane7(a);
1530 break;
1531 case 4:
1532 // triangle 4 - 8 9 A
1533 verts[0] = swizzleLane8(a);
1534 verts[1] = swizzleLane9(a);
1535 verts[2] = swizzleLaneA(a);
1536 break;
1537 case 5:
1538 // triangle 5 - 8 A B
1539 verts[0] = swizzleLane8(a);
1540 verts[1] = swizzleLaneA(a);
1541 verts[2] = swizzleLaneB(a);
1542 break;
1543 case 6:
1544 // triangle 6 - C D E
1545 verts[0] = swizzleLaneC(a);
1546 verts[1] = swizzleLaneD(a);
1547 verts[2] = swizzleLaneE(a);
1548 break;
1549 case 7:
1550 // triangle 7 - C E F
1551 verts[0] = swizzleLaneC(a);
1552 verts[1] = swizzleLaneE(a);
1553 verts[2] = swizzleLaneF(a);
1554 break;
1555 case 8:
1556 // triangle 0 - 0 1 2
1557 verts[0] = swizzleLane0(b);
1558 verts[1] = swizzleLane1(b);
1559 verts[2] = swizzleLane2(b);
1560 break;
1561 case 9:
1562 // triangle 1 - 0 2 3
1563 verts[0] = swizzleLane0(b);
1564 verts[1] = swizzleLane2(b);
1565 verts[2] = swizzleLane3(b);
1566 break;
1567 case 10:
1568 // triangle 2 - 4 5 6
1569 verts[0] = swizzleLane4(b);
1570 verts[1] = swizzleLane5(b);
1571 verts[2] = swizzleLane6(b);
1572 break;
1573 case 11:
1574 // triangle 3 - 4 6 7
1575 verts[0] = swizzleLane4(b);
1576 verts[1] = swizzleLane6(b);
1577 verts[2] = swizzleLane7(b);
1578 break;
1579 case 12:
1580 // triangle 4 - 8 9 A
1581 verts[0] = swizzleLane8(b);
1582 verts[1] = swizzleLane9(b);
1583 verts[2] = swizzleLaneA(b);
1584 break;
1585 case 13:
1586 // triangle 5 - 8 A B
1587 verts[0] = swizzleLane8(b);
1588 verts[1] = swizzleLaneA(b);
1589 verts[2] = swizzleLaneB(b);
1590 break;
1591 case 14:
1592 // triangle 6 - C D E
1593 verts[0] = swizzleLaneC(b);
1594 verts[1] = swizzleLaneD(b);
1595 verts[2] = swizzleLaneE(b);
1596 break;
1597 case 15:
1598 // triangle 7 - C E F
1599 verts[0] = swizzleLaneC(b);
1600 verts[1] = swizzleLaneE(b);
1601 verts[2] = swizzleLaneF(b);
1602 break;
1603 }
1604 #else
1605 const simdvector &a = PaGetSimdVector(pa, 0, slot);
1606 const simdvector &b = PaGetSimdVector(pa, 1, slot);
1607
1608 switch (primIndex)
1609 {
1610 case 0:
1611 // triangle 0 - 0 1 2
1612 verts[0] = swizzleLane0(a);
1613 verts[1] = swizzleLane1(a);
1614 verts[2] = swizzleLane2(a);
1615 break;
1616 case 1:
1617 // triangle 1 - 0 2 3
1618 verts[0] = swizzleLane0(a);
1619 verts[1] = swizzleLane2(a);
1620 verts[2] = swizzleLane3(a);
1621 break;
1622 case 2:
1623 // triangle 2 - 4 5 6
1624 verts[0] = swizzleLane4(a);
1625 verts[1] = swizzleLane5(a);
1626 verts[2] = swizzleLane6(a);
1627 break;
1628 case 3:
1629 // triangle 3 - 4 6 7
1630 verts[0] = swizzleLane4(a);
1631 verts[1] = swizzleLane6(a);
1632 verts[2] = swizzleLane7(a);
1633 break;
1634 case 4:
1635 // triangle 4 - 8 9 10 (0 1 2)
1636 verts[0] = swizzleLane0(b);
1637 verts[1] = swizzleLane1(b);
1638 verts[2] = swizzleLane2(b);
1639 break;
1640 case 5:
1641         // triangle 5 - 8 10 11 (0 2 3)
1642 verts[0] = swizzleLane0(b);
1643 verts[1] = swizzleLane2(b);
1644 verts[2] = swizzleLane3(b);
1645 break;
1646 case 6:
1647         // triangle 6 - 12 13 14 (4 5 6)
1648 verts[0] = swizzleLane4(b);
1649 verts[1] = swizzleLane5(b);
1650 verts[2] = swizzleLane6(b);
1651 break;
1652 case 7:
1653         // triangle 7 - 12 14 15 (4 6 7)
1654 verts[0] = swizzleLane4(b);
1655 verts[1] = swizzleLane6(b);
1656 verts[2] = swizzleLane7(b);
1657 break;
1658 }
1659 #endif
1660 }
1661
1662 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1663 {
1664 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
1665 return false;
1666 }
1667
1668 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1669 {
1670 PaLineStrip1(pa, slot, verts);
1671
1672 if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1)
1673 {
1674 // loop reconnect now
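        // PaLineStrip1 above gave the last primitive of the loop an end vertex from
        // beyond the loop; 'lane' is the position of that primitive within this SIMD
        // batch, so its end vertex (verts[1]) can be redirected to the loop's first vertex.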
1675 const int lane = pa.numPrims - pa.numPrimsComplete - 1;
1676
1677 #if USE_SIMD16_FRONTEND
1678 simdvector first;
1679
1680 const simd16vector &first_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
1681
1682 if (!pa.useAlternateOffset)
1683 {
1684 for (uint32_t i = 0; i < 4; i += 1)
1685 {
1686 first[i] = _simd16_extract_ps(first_16[i], 0);
1687 }
1688 }
1689 else
1690 {
1691 for (uint32_t i = 0; i < 4; i += 1)
1692 {
1693 first[i] = _simd16_extract_ps(first_16[i], 1);
1694 }
1695 }
1696
1697 #else
1698 simdvector &first = PaGetSimdVector(pa, pa.first, slot);
1699
1700 #endif
1701 for (int i = 0; i < 4; i++)
1702 {
1703 float *firstVtx = (float *)&(first[i]);
1704 float *targetVtx = (float *)&(verts[1][i]);
1705 targetVtx[lane] = firstVtx[0];
1706 }
1707 }
1708
1709 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1710 return true;
1711 }
1712
1713 #if ENABLE_AVX512_SIMD16
1714 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1715 {
1716 SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0);
1717 return false;
1718 }
1719
1720 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1721 {
1722 PaLineStrip1_simd16(pa, slot, verts);
1723
1724 if (pa.numPrimsComplete + KNOB_SIMD16_WIDTH > pa.numPrims - 1)
1725 {
1726 // loop reconnect now
1727 const int lane = pa.numPrims - pa.numPrimsComplete - 1;
1728
1729 const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot);
1730
1731 for (int i = 0; i < 4; i++)
1732 {
1733 float *firstVtx = (float *)&(first[i]);
1734 float *targetVtx = (float *)&(verts[1][i]);
1735 targetVtx[lane] = firstVtx[0];
1736 }
1737 }
1738
1739 SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1740 return true;
1741 }
1742
1743 #endif
1744 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1745 {
1746 PaLineStripSingle0(pa, slot, primIndex, verts);
1747
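    // If this is the final primitive of the loop, the strip assembly above paired the
    // last vertex with a vertex past the end; close the loop by substituting the loop's
    // first vertex as the end point instead.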
1748 if (pa.numPrimsComplete + primIndex == pa.numPrims - 1)
1749 {
1750 #if USE_SIMD16_FRONTEND
1751 const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot);
1752
1753 verts[1] = swizzleLane0(first);
1754 #else
1755 const simdvector &first = PaGetSimdVector(pa, pa.first, slot);
1756
1757 verts[1] = swizzleLane0(first);
1758 #endif
1759 }
1760 }
1761
1762 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1763 {
1764 SetNextPaState(pa, PaLineList1, PaLineListSingle0);
1765 return false; // Not enough vertices to assemble 8 lines
1766 }
1767
1768 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1769 {
1770 #if USE_SIMD16_FRONTEND
1771 simdvector a;
1772 simdvector b;
1773
1774 if (!pa.useAlternateOffset)
1775 {
1776 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1777
1778 for (uint32_t i = 0; i < 4; i += 1)
1779 {
1780 a[i] = _simd16_extract_ps(a_16[i], 0);
1781 b[i] = _simd16_extract_ps(a_16[i], 1);
1782 }
1783 }
1784 else
1785 {
1786 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1787
1788 for (uint32_t i = 0; i < 4; i += 1)
1789 {
1790 a[i] = _simd16_extract_ps(b_16[i], 0);
1791 b[i] = _simd16_extract_ps(b_16[i], 1);
1792 }
1793 }
1794
1795 #else
1796 simdvector &a = PaGetSimdVector(pa, 0, slot);
1797 simdvector &b = PaGetSimdVector(pa, 1, slot);
1798
1799 #endif
1800 /// @todo: verify provoking vertex is correct
1801 // Line list 0 1 2 3 4 5 6 7
1802 // 8 9 10 11 12 13 14 15
1803
1804 // shuffle:
1805 // 0 2 4 6 8 10 12 14
1806 // 1 3 5 7 9 11 13 15
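    //
    // Worked example for one component, with a = { 0 1 2 3 4 5 6 7 } and
    // b = { 8 9 10 11 12 13 14 15 }:
    //   _mm256_permute2f128_ps(a, b, 0x20) -> 0 1 2 3 | 8 9 10 11    (low 128-bit halves)
    //   _mm256_permute2f128_ps(a, b, 0x31) -> 4 5 6 7 | 12 13 14 15  (high 128-bit halves)
    // The _MM_SHUFFLE(2,0,2,0) / (3,1,3,1) shuffles then gather the even / odd elements
    // of each half, yielding the start and end vertices listed above.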
1807
1808 for (uint32_t i = 0; i < 4; ++i)
1809 {
1810 // 0 1 2 3 8 9 10 11
1811 __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
1812 // 4 5 6 7 12 13 14 15
1813 __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
1814
1815 // 0 2 4 6 8 10 12 14
1816 verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
1817 // 1 3 5 7 9 11 13 15
1818 verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
1819 }
1820
1821 SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1822 return true;
1823 }
1824
1825 #if ENABLE_AVX512_SIMD16
1826 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1827 {
1828 SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0);
1829 return false; // Not enough vertices to assemble 16 lines
1830 }
1831
1832 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1833 {
1834 const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
1835 const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
1836
1837 simd16vector &v0 = verts[0];
1838 simd16vector &v1 = verts[1];
1839
1840 // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
1841     // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
1842
1843 // for simd16 x, y, z, and w
1844 for (int i = 0; i < 4; i += 1)
1845 {
1846         simd16scalar temp0 = _simd16_permute2f128_ps(a[i], b[i], 0x88); // (2 0 2 0) 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
1847 simd16scalar temp1 = _simd16_permute2f128_ps(a[i], b[i], 0xDD); // (3 1 3 1) 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1848
1849 v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
1850 v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
1851 }
1852
1853 SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1854 return true;
1855 }
1856
1857 #endif
1858 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1859 {
1860 #if USE_SIMD16_FRONTEND
1861 const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
1862 const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
1863
1864 if (pa.useAlternateOffset)
1865 {
1866 primIndex += KNOB_SIMD_WIDTH;
1867 }
1868
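    // Line p of the list is simply lanes { 2p, 2p+1 }, falling through to vector b
    // once 2p runs past the lanes of a.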
1869 switch (primIndex)
1870 {
1871 case 0:
1872 verts[0] = swizzleLane0(a);
1873 verts[1] = swizzleLane1(a);
1874 break;
1875 case 1:
1876 verts[0] = swizzleLane2(a);
1877 verts[1] = swizzleLane3(a);
1878 break;
1879 case 2:
1880 verts[0] = swizzleLane4(a);
1881 verts[1] = swizzleLane5(a);
1882 break;
1883 case 3:
1884 verts[0] = swizzleLane6(a);
1885 verts[1] = swizzleLane7(a);
1886 break;
1887 case 4:
1888 verts[0] = swizzleLane8(a);
1889 verts[1] = swizzleLane9(a);
1890 break;
1891 case 5:
1892 verts[0] = swizzleLaneA(a);
1893 verts[1] = swizzleLaneB(a);
1894 break;
1895 case 6:
1896 verts[0] = swizzleLaneC(a);
1897 verts[1] = swizzleLaneD(a);
1898 break;
1899 case 7:
1900 verts[0] = swizzleLaneE(a);
1901 verts[1] = swizzleLaneF(a);
1902 break;
1903 case 8:
1904 verts[0] = swizzleLane0(b);
1905 verts[1] = swizzleLane1(b);
1906 break;
1907 case 9:
1908 verts[0] = swizzleLane2(b);
1909 verts[1] = swizzleLane3(b);
1910 break;
1911 case 10:
1912 verts[0] = swizzleLane4(b);
1913 verts[1] = swizzleLane5(b);
1914 break;
1915 case 11:
1916 verts[0] = swizzleLane6(b);
1917 verts[1] = swizzleLane7(b);
1918 break;
1919 case 12:
1920 verts[0] = swizzleLane8(b);
1921 verts[1] = swizzleLane9(b);
1922 break;
1923 case 13:
1924 verts[0] = swizzleLaneA(b);
1925 verts[1] = swizzleLaneB(b);
1926 break;
1927 case 14:
1928 verts[0] = swizzleLaneC(b);
1929 verts[1] = swizzleLaneD(b);
1930 break;
1931 case 15:
1932 verts[0] = swizzleLaneE(b);
1933 verts[1] = swizzleLaneF(b);
1934 break;
1935 }
1936 #else
1937 const simdvector &a = PaGetSimdVector(pa, 0, slot);
1938 const simdvector &b = PaGetSimdVector(pa, 1, slot);
1939
1940 switch (primIndex)
1941 {
1942 case 0:
1943 verts[0] = swizzleLane0(a);
1944 verts[1] = swizzleLane1(a);
1945 break;
1946 case 1:
1947 verts[0] = swizzleLane2(a);
1948 verts[1] = swizzleLane3(a);
1949 break;
1950 case 2:
1951 verts[0] = swizzleLane4(a);
1952 verts[1] = swizzleLane5(a);
1953 break;
1954 case 3:
1955 verts[0] = swizzleLane6(a);
1956 verts[1] = swizzleLane7(a);
1957 break;
1958 case 4:
1959 verts[0] = swizzleLane0(b);
1960 verts[1] = swizzleLane1(b);
1961 break;
1962 case 5:
1963 verts[0] = swizzleLane2(b);
1964 verts[1] = swizzleLane3(b);
1965 break;
1966 case 6:
1967 verts[0] = swizzleLane4(b);
1968 verts[1] = swizzleLane5(b);
1969 break;
1970 case 7:
1971 verts[0] = swizzleLane6(b);
1972 verts[1] = swizzleLane7(b);
1973 break;
1974 }
1975 #endif
1976 }
1977
1978 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1979 {
1980 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
1981 return false; // Not enough vertices to assemble 8 lines
1982 }
1983
1984 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1985 {
1986 #if USE_SIMD16_FRONTEND
1987 simdvector a;
1988 simdvector b;
1989
1990 if (!pa.useAlternateOffset)
1991 {
1992 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
1993
1994 for (uint32_t i = 0; i < 4; i += 1)
1995 {
1996 a[i] = _simd16_extract_ps(a_16[i], 0);
1997 b[i] = _simd16_extract_ps(a_16[i], 1);
1998 }
1999 }
2000 else
2001 {
2002 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
2003
2004 for (uint32_t i = 0; i < 4; i += 1)
2005 {
2006 a[i] = _simd16_extract_ps(b_16[i], 0);
2007 b[i] = _simd16_extract_ps(b_16[i], 1);
2008 }
2009 }
2010
2011 #else
2012 simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
2013 simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
2014
2015 #endif
2016 /// @todo: verify provoking vertex is correct
2017 // Line list 0 1 2 3 4 5 6 7
2018 // 8 9 10 11 12 13 14 15
2019
2020 // shuffle:
2021 // 0 1 2 3 4 5 6 7
2022 // 1 2 3 4 5 6 7 8
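    //
    // verts[0] is the batch as-is; verts[1] is the same batch shifted by one vertex,
    // with vertex 8 (the first vertex of the next simd vector) filling the last lane:
    //   _mm256_permute_ps(a, 0x39) moves element i+1 into element i within each 128-bit half,
    //   _mm256_permute2f128_ps(a, b, 0x21) supplies 4 5 6 7 | 8 9 10 11, and
    //   the final blend with mask 0x88 drops vertices 4 and 8 into lanes 3 and 7.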
2023
2024 verts[0] = a;
2025
2026 for(uint32_t i = 0; i < 4; ++i)
2027 {
2028 // 1 2 3 x 5 6 7 x
2029 __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
2030 // 4 5 6 7 8 9 10 11
2031 __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
2032
2033 // x x x 4 x x x 8
2034 __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0)
2035
2036 verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
2037 }
2038
2039 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
2040 return true;
2041 }
2042
2043 #if ENABLE_AVX512_SIMD16
2044 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2045 {
2046 SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0);
2047 return false; // Not enough vertices to assemble 16 lines
2048 }
2049
2050 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2051 {
2052 const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
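    // _simd16_set_epi32 takes its arguments most-significant element first, so these
    // permute indices read { 1, 2, ..., 15, 0 } from element 0 upward; the permute below
    // therefore shifts the blended vector by one element, turning b0 a1 .. aF into a1 .. aF b0.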
2053
2054 const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
2055 const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
2056
2057 simd16vector &v0 = verts[0];
2058 simd16vector &v1 = verts[1];
2059
2060 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2061 // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2062
2063 v0 = a; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2064
2065 // for simd16 x, y, z, and w
2066 for (int i = 0; i < 4; i += 1)
2067 {
2068 simd16scalar temp = _simd16_blend_ps(a[i], b[i], 0x0001); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2069
2070 v1[i] = _simd16_permute_ps(temp, perm); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2071 }
2072
2073 SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
2074 return true;
2075 }
2076
2077 #endif
2078 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2079 {
2080 #if USE_SIMD16_FRONTEND
2081 const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
2082 const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
2083
2084 if (pa.useAlternateOffset)
2085 {
2086 primIndex += KNOB_SIMD_WIDTH;
2087 }
2088
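    // Line p of the strip is lanes { p, p+1 }; only the last primitive of the batch
    // reaches into lane 0 of the next simd vector (b).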
2089 switch (primIndex)
2090 {
2091 case 0:
2092 verts[0] = swizzleLane0(a);
2093 verts[1] = swizzleLane1(a);
2094 break;
2095 case 1:
2096 verts[0] = swizzleLane1(a);
2097 verts[1] = swizzleLane2(a);
2098 break;
2099 case 2:
2100 verts[0] = swizzleLane2(a);
2101 verts[1] = swizzleLane3(a);
2102 break;
2103 case 3:
2104 verts[0] = swizzleLane3(a);
2105 verts[1] = swizzleLane4(a);
2106 break;
2107 case 4:
2108 verts[0] = swizzleLane4(a);
2109 verts[1] = swizzleLane5(a);
2110 break;
2111 case 5:
2112 verts[0] = swizzleLane5(a);
2113 verts[1] = swizzleLane6(a);
2114 break;
2115 case 6:
2116 verts[0] = swizzleLane6(a);
2117 verts[1] = swizzleLane7(a);
2118 break;
2119 case 7:
2120 verts[0] = swizzleLane7(a);
2121 verts[1] = swizzleLane8(a);
2122 break;
2123 case 8:
2124 verts[0] = swizzleLane8(a);
2125 verts[1] = swizzleLane9(a);
2126 break;
2127 case 9:
2128 verts[0] = swizzleLane9(a);
2129 verts[1] = swizzleLaneA(a);
2130 break;
2131 case 10:
2132 verts[0] = swizzleLaneA(a);
2133 verts[1] = swizzleLaneB(a);
2134 break;
2135 case 11:
2136 verts[0] = swizzleLaneB(a);
2137 verts[1] = swizzleLaneC(a);
2138 break;
2139 case 12:
2140 verts[0] = swizzleLaneC(a);
2141 verts[1] = swizzleLaneD(a);
2142 break;
2143 case 13:
2144 verts[0] = swizzleLaneD(a);
2145 verts[1] = swizzleLaneE(a);
2146 break;
2147 case 14:
2148 verts[0] = swizzleLaneE(a);
2149 verts[1] = swizzleLaneF(a);
2150 break;
2151 case 15:
2152 verts[0] = swizzleLaneF(a);
2153 verts[1] = swizzleLane0(b);
2154 break;
2155 }
2156 #else
2157 const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
2158 const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
2159
2160 switch (primIndex)
2161 {
2162 case 0:
2163 verts[0] = swizzleLane0(a);
2164 verts[1] = swizzleLane1(a);
2165 break;
2166 case 1:
2167 verts[0] = swizzleLane1(a);
2168 verts[1] = swizzleLane2(a);
2169 break;
2170 case 2:
2171 verts[0] = swizzleLane2(a);
2172 verts[1] = swizzleLane3(a);
2173 break;
2174 case 3:
2175 verts[0] = swizzleLane3(a);
2176 verts[1] = swizzleLane4(a);
2177 break;
2178 case 4:
2179 verts[0] = swizzleLane4(a);
2180 verts[1] = swizzleLane5(a);
2181 break;
2182 case 5:
2183 verts[0] = swizzleLane5(a);
2184 verts[1] = swizzleLane6(a);
2185 break;
2186 case 6:
2187 verts[0] = swizzleLane6(a);
2188 verts[1] = swizzleLane7(a);
2189 break;
2190 case 7:
2191 verts[0] = swizzleLane7(a);
2192 verts[1] = swizzleLane0(b);
2193 break;
2194 }
2195 #endif
2196 }
2197
2198 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2199 {
2200 #if USE_SIMD16_FRONTEND
2201 simdvector a;
2202
2203 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2204
2205 if (!pa.useAlternateOffset)
2206 {
2207 for (uint32_t i = 0; i < 4; i += 1)
2208 {
2209 a[i] = _simd16_extract_ps(a_16[i], 0);
2210 }
2211 }
2212 else
2213 {
2214 for (uint32_t i = 0; i < 4; i += 1)
2215 {
2216 a[i] = _simd16_extract_ps(a_16[i], 1);
2217 }
2218 }
2219
2220 #else
2221 simdvector &a = PaGetSimdVector(pa, 0, slot);
2222
2223 #endif
2224 verts[0] = a; // points only have 1 vertex.
2225
2226 SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2227 return true;
2228 }
2229
2230 #if ENABLE_AVX512_SIMD16
2231 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2232 {
2233 simd16vector &a = PaGetSimdVector_simd16(pa, pa.cur, slot);
2234
2235 verts[0] = a; // points only have 1 vertex.
2236
2237 SetNextPaState_simd16(pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2238 return true;
2239 }
2240
2241 #endif
2242 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2243 {
2244 #if USE_SIMD16_FRONTEND
2245 const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
2246
2247 if (pa.useAlternateOffset)
2248 {
2249 primIndex += KNOB_SIMD_WIDTH;
2250 }
2251
2252 verts[0] = swizzleLaneN(a, primIndex);
2253 #else
2254 const simdvector &a = PaGetSimdVector(pa, 0, slot);
2255
2256 verts[0] = swizzleLaneN(a, primIndex);
2257 #endif
2258 }
2259
2260 //////////////////////////////////////////////////////////////////////////
2261 /// @brief State 0 for RECT_LIST topology.
2262 ///        There are not enough vertices to assemble 8 triangles.
2263 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2264 {
2265 SetNextPaState(pa, PaRectList1, PaRectListSingle0);
2266 return false;
2267 }
2268
2269 //////////////////////////////////////////////////////////////////////////
2270 /// @brief State 1 for RECT_LIST topology.
2271 ///        Rect lists have the following format.
2272 /// w x y z
2273 /// v2 o---o v5 o---o v8 o---o v11 o---o
2274 /// | \ | | \ | | \ | | \ |
2275 /// v1 o---o v4 o---o v7 o---o v10 o---o
2276 /// v0 v3 v6 v9
2277 ///
2278 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2279 ///
2280 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2281 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2282 /// etc.
2283 ///
2284 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2285 /// where v0 contains all the first vertices for 8 triangles.
2286 ///
2287 /// Result:
2288 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2289 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2290 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
2291 ///
2292 /// @param pa - State for PA state machine.
2293 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2294 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
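///
/// A worked example, assuming a screen-aligned rect (the clear case): if
/// v0 = (x0, y0), v1 = (x0, y1), v2 = (x1, y1), then the implied corner is
/// w = v0 - v1 + v2 = (x1, y0), i.e. the parallelogram completion computed per
/// component by the subtract/add at the end of the loop below.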
2295 bool PaRectList1(
2296 PA_STATE_OPT& pa,
2297 uint32_t slot,
2298 simdvector verts[])
2299 {
2300 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
2301 #if USE_SIMD16_FRONTEND
2302 simdvector a;
2303 simdvector b;
2304
2305 if (!pa.useAlternateOffset)
2306 {
2307 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2308
2309 for (uint32_t i = 0; i < 4; i += 1)
2310 {
2311 a[i] = _simd16_extract_ps(a_16[i], 0);
2312 b[i] = _simd16_extract_ps(a_16[i], 1);
2313 }
2314 }
2315 else
2316 {
2317 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
2318
2319 for (uint32_t i = 0; i < 4; i += 1)
2320 {
2321 a[i] = _simd16_extract_ps(b_16[i], 0);
2322             b[i] = _simd16_extract_ps(b_16[i], 1);
2323 }
2324 }
2325
2326 #else
2327 simdvector &a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
2328 simdvector &b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
2329
2330 #endif
2331 __m256 tmp0, tmp1, tmp2;
2332
2333 // Loop over each component in the simdvector.
2334 for(int i = 0; i < 4; ++i)
2335 {
2336 simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2337 tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2338 v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
2339 tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
2340 v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
2341 v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
2342
2343 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2344 /// AVX2 should make this much cheaper.
2345 simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2346 v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
2347 tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
2348 tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
2349         tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);  // tmp1 = { v7, *, v4, v5, *, *, *, * }
2350 v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
2351 v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
2352 v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2353
2354 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
2355 simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2356 v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
2357 tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
2358 v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
2359
2360 // Need to compute 4th implied vertex for the rectangle.
2361 tmp2 = _mm256_sub_ps(v0[i], v1[i]);
2362 tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * }
2363 tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
2364 v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
2365 }
2366
2367 SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2368 return true;
2369 }
2370
2371 //////////////////////////////////////////////////////////////////////////
2372 /// @brief State 2 for RECT_LIST topology.
2373 ///        Not implemented unless there is a use case for more than 8 rects.
2374 /// @param pa - State for PA state machine.
2375 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2376 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
2377 bool PaRectList2(
2378 PA_STATE_OPT& pa,
2379 uint32_t slot,
2380 simdvector verts[])
2381 {
2382     SWR_INVALID("Is rect list used for anything other than clears?");
2383 SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2384 return true;
2385 }
2386
2387 #if ENABLE_AVX512_SIMD16
2388 //////////////////////////////////////////////////////////////////////////
2389 /// @brief State 0 for RECT_LIST topology.
2390 ///        There are not enough vertices to assemble 8 triangles.
2391 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2392 {
2393 SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0);
2394 return false;
2395 }
2396
2397 //////////////////////////////////////////////////////////////////////////
2398 /// @brief State 1 for RECT_LIST topology.
2399 ///        Rect lists have the following format.
2400 /// w x y z
2401 /// v2 o---o v5 o---o v8 o---o v11 o---o
2402 /// | \ | | \ | | \ | | \ |
2403 /// v1 o---o v4 o---o v7 o---o v10 o---o
2404 /// v0 v3 v6 v9
2405 ///
2406 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2407 ///
2408 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2409 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2410 /// etc.
2411 ///
2412 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2413 /// where v0 contains all the first vertices for 8 triangles.
2414 ///
2415 /// Result:
2416 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2417 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2418 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
2419 ///
2420 /// @param pa - State for PA state machine.
2421 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2422 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
2423 bool PaRectList1_simd16(
2424 PA_STATE_OPT& pa,
2425 uint32_t slot,
2426 simd16vector verts[])
2427 {
2428 simdvector a;
2429 simdvector b;
2430
2431 if (!pa.useAlternateOffset)
2432 {
2433 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 }
2434
2435 for (uint32_t i = 0; i < 4; i += 1)
2436 {
2437 a[i] = _simd16_extract_ps(a_16[i], 0);
2438 b[i] = _simd16_extract_ps(a_16[i], 1);
2439 }
2440 }
2441 else
2442 {
2443         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);    // b[] = { v16, ... } -- not used by this implementation
2444
2445 for (uint32_t i = 0; i < 4; i += 1)
2446 {
2447 a[i] = _simd16_extract_ps(b_16[i], 0);
2448 b[i] = _simd16_extract_ps(b_16[i], 1);
2449 }
2450 }
2451
2452 simd16vector &v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2453 simd16vector &v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2454 simd16vector &v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2455
2456 // Loop over each component in the simdvector.
2457 for (int i = 0; i < 4; i += 1)
2458 {
2459 simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2460 simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2461 simdscalar v2_lo; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2462
2463 __m256 tmp0, tmp1, tmp2;
2464
2465 tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2466 v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
2467 tmp1 = _mm256_permute_ps(v0_lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
2468 v0_lo = _mm256_permute_ps(v0_lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
2469 v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
2470
2471 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2472 /// AVX2 should make this much cheaper.
2473 v1_lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
2474 tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
2475 tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
2476         tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);  // tmp1 = { v7, *, v4, v5, *, *, *, * }
2477 v1_lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
2478 v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
2479 v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2480
2481 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
2482 v2_lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
2483 tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
2484 v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0);
2485
2486 // Need to compute 4th implied vertex for the rectangle.
2487 tmp2 = _mm256_sub_ps(v0_lo, v1_lo);
2488 tmp2 = _mm256_add_ps(tmp2, v2_lo); // tmp2 = { w, *, x, *, y, *, z, * }
2489 tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
2490 v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
2491
2492 v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0);
2493 v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0);
2494 v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
2495 }
2496
2497 SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2498 return true;
2499 }
2500
2501 //////////////////////////////////////////////////////////////////////////
2502 /// @brief State 2 for RECT_LIST topology.
2503 ///        Not implemented unless there is a use case for more than 8 rects.
2504 /// @param pa - State for PA state machine.
2505 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2506 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
2507 bool PaRectList2_simd16(
2508 PA_STATE_OPT& pa,
2509 uint32_t slot,
2510 simd16vector verts[])
2511 {
2512     SWR_INVALID("Is rect list used for anything other than clears?");
2513 SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2514 return true;
2515 }
2516
2517 #endif
2518 //////////////////////////////////////////////////////////////////////////
2519 /// @brief This procedure is called by the Binner to assemble the attributes.
2520 /// Unlike position, which is stored vertically, the attributes are
2521 /// stored horizontally. The outputs from the VS, labeled as 'a' and
2522 /// 'b' are vertical. This function needs to transpose the lanes
2523 /// containing the vertical attribute data into horizontal form.
2524 /// @param pa - State for PA state machine.
2525 /// @param slot - Index into VS output for a given attribute.
2526 /// @param primIndex - Binner processes each triangle individually.
2527 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
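///
/// In the vertical (SoA) layout each simdvector component holds one channel (x, y, z
/// or w) across all lanes; the swizzleLane helpers transpose a single lane into a
/// simd4scalar holding that vertex's full (x, y, z, w) tuple, which is the horizontal
/// form the binner consumes.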
2528 void PaRectListSingle0(
2529 PA_STATE_OPT& pa,
2530 uint32_t slot,
2531 uint32_t primIndex,
2532 simd4scalar verts[])
2533 {
2534 // We have 12 simdscalars contained within 3 simdvectors which
2535     // hold at least 8 triangles' worth of data. We want to assemble a single
2536 // triangle with data in horizontal form.
2537 #if USE_SIMD16_FRONTEND
2538 simdvector a;
2539 simdvector b;
2540
2541 if (!pa.useAlternateOffset)
2542 {
2543 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2544
2545 for (uint32_t i = 0; i < 4; i += 1)
2546 {
2547 a[i] = _simd16_extract_ps(a_16[i], 0);
2548 b[i] = _simd16_extract_ps(a_16[i], 1);
2549 }
2550 }
2551 else
2552 {
2553 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
2554
2555 for (uint32_t i = 0; i < 4; i += 1)
2556 {
2557 a[i] = _simd16_extract_ps(b_16[i], 0);
2558             b[i] = _simd16_extract_ps(b_16[i], 1);
2559 }
2560 }
2561
2562 #else
2563 simdvector& a = PaGetSimdVector(pa, 0, slot);
2564
2565 #endif
2566 // Convert from vertical to horizontal.
2567 switch(primIndex)
2568 {
2569 case 0:
2570 verts[0] = swizzleLane0(a);
2571 verts[1] = swizzleLane1(a);
2572 verts[2] = swizzleLane2(a);
2573 break;
2574 case 1:
2575 verts[0] = swizzleLane0(a);
2576 verts[1] = swizzleLane2(a);
2577 verts[2] = _mm_blend_ps(verts[0], verts[1], 0xA);
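        // This shortcut presumably relies on the rect being screen-aligned (v1 sharing
        // x/z with v0 and y/w with v2), in which case the blend of v0 and v2 equals
        // w = v0 - v1 + v2; consistent with the clears-only assertion in PaRectList2.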
2578 break;
2579 case 2:
2580 case 3:
2581 case 4:
2582 case 5:
2583 case 6:
2584 case 7:
2585 SWR_INVALID("Invalid primIndex: %d", primIndex);
2586 break;
2587 };
2588 }
2589
2590 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts,
2591 uint32_t in_vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo) :
2592 PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0),
2593 cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
2594 {
2595 const API_STATE& state = GetApiState(pDC);
2596
2597 this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
2598
2599 #if ENABLE_AVX512_SIMD16
2600 pfnPaFunc_simd16 = nullptr;
2601
2602 #endif
2603 switch (this->binTopology)
2604 {
2605 case TOP_TRIANGLE_LIST:
2606 this->pfnPaFunc = PaTriList0;
2607 #if ENABLE_AVX512_SIMD16
2608 this->pfnPaFunc_simd16 = PaTriList0_simd16;
2609 #endif
2610 break;
2611 case TOP_TRIANGLE_STRIP:
2612 this->pfnPaFunc = PaTriStrip0;
2613 #if ENABLE_AVX512_SIMD16
2614 this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
2615 #endif
2616 break;
2617 case TOP_TRIANGLE_FAN:
2618 this->pfnPaFunc = PaTriFan0;
2619 #if ENABLE_AVX512_SIMD16
2620 this->pfnPaFunc_simd16 = PaTriFan0_simd16;
2621 #endif
2622 break;
2623 case TOP_QUAD_LIST:
2624 this->pfnPaFunc = PaQuadList0;
2625 #if ENABLE_AVX512_SIMD16
2626 this->pfnPaFunc_simd16 = PaQuadList0_simd16;
2627 #endif
2628 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
2629 break;
2630 case TOP_QUAD_STRIP:
2631         // quad strip pattern when decomposed into triangles is the same as triangle strips
2632 this->pfnPaFunc = PaTriStrip0;
2633 #if ENABLE_AVX512_SIMD16
2634 this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
2635 #endif
2636 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
2637 break;
2638 case TOP_LINE_LIST:
2639 this->pfnPaFunc = PaLineList0;
2640 #if ENABLE_AVX512_SIMD16
2641 this->pfnPaFunc_simd16 = PaLineList0_simd16;
2642 #endif
2643 this->numPrims = in_numPrims;
2644 break;
2645 case TOP_LINE_STRIP:
2646 this->pfnPaFunc = PaLineStrip0;
2647 #if ENABLE_AVX512_SIMD16
2648 this->pfnPaFunc_simd16 = PaLineStrip0_simd16;
2649 #endif
2650 this->numPrims = in_numPrims;
2651 break;
2652 case TOP_LINE_LOOP:
2653 this->pfnPaFunc = PaLineLoop0;
2654 #if ENABLE_AVX512_SIMD16
2655 this->pfnPaFunc_simd16 = PaLineLoop0_simd16;
2656 #endif
2657 this->numPrims = in_numPrims;
2658 break;
2659 case TOP_POINT_LIST:
2660 this->pfnPaFunc = PaPoints0;
2661 #if ENABLE_AVX512_SIMD16
2662 this->pfnPaFunc_simd16 = PaPoints0_simd16;
2663 #endif
2664 this->numPrims = in_numPrims;
2665 break;
2666 case TOP_RECT_LIST:
2667 this->pfnPaFunc = PaRectList0;
2668 #if ENABLE_AVX512_SIMD16
2669 this->pfnPaFunc_simd16 = PaRectList0_simd16;
2670 #endif
2671 this->numPrims = in_numPrims * 2;
2672 break;
2673
2674 case TOP_PATCHLIST_1:
2675 this->pfnPaFunc = PaPatchList<1>;
2676 #if ENABLE_AVX512_SIMD16
2677 this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
2678 #endif
2679 break;
2680 case TOP_PATCHLIST_2:
2681 this->pfnPaFunc = PaPatchList<2>;
2682 #if ENABLE_AVX512_SIMD16
2683 this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
2684 #endif
2685 break;
2686 case TOP_PATCHLIST_3:
2687 this->pfnPaFunc = PaPatchList<3>;
2688 #if ENABLE_AVX512_SIMD16
2689 this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
2690 #endif
2691 break;
2692 case TOP_PATCHLIST_4:
2693 this->pfnPaFunc = PaPatchList<4>;
2694 #if ENABLE_AVX512_SIMD16
2695 this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
2696 #endif
2697 break;
2698 case TOP_PATCHLIST_5:
2699 this->pfnPaFunc = PaPatchList<5>;
2700 #if ENABLE_AVX512_SIMD16
2701 this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
2702 #endif
2703 break;
2704 case TOP_PATCHLIST_6:
2705 this->pfnPaFunc = PaPatchList<6>;
2706 #if ENABLE_AVX512_SIMD16
2707 this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
2708 #endif
2709 break;
2710 case TOP_PATCHLIST_7:
2711 this->pfnPaFunc = PaPatchList<7>;
2712 #if ENABLE_AVX512_SIMD16
2713 this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
2714 #endif
2715 break;
2716 case TOP_PATCHLIST_8:
2717 this->pfnPaFunc = PaPatchList<8>;
2718 #if ENABLE_AVX512_SIMD16
2719 this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
2720 #endif
2721 break;
2722 case TOP_PATCHLIST_9:
2723 this->pfnPaFunc = PaPatchList<9>;
2724 #if ENABLE_AVX512_SIMD16
2725 this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
2726 #endif
2727 break;
2728 case TOP_PATCHLIST_10:
2729 this->pfnPaFunc = PaPatchList<10>;
2730 #if ENABLE_AVX512_SIMD16
2731 this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
2732 #endif
2733 break;
2734 case TOP_PATCHLIST_11:
2735 this->pfnPaFunc = PaPatchList<11>;
2736 #if ENABLE_AVX512_SIMD16
2737 this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
2738 #endif
2739 break;
2740 case TOP_PATCHLIST_12:
2741 this->pfnPaFunc = PaPatchList<12>;
2742 #if ENABLE_AVX512_SIMD16
2743 this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
2744 #endif
2745 break;
2746 case TOP_PATCHLIST_13:
2747 this->pfnPaFunc = PaPatchList<13>;
2748 #if ENABLE_AVX512_SIMD16
2749 this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
2750 #endif
2751 break;
2752 case TOP_PATCHLIST_14:
2753 this->pfnPaFunc = PaPatchList<14>;
2754 #if ENABLE_AVX512_SIMD16
2755 this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
2756 #endif
2757 break;
2758 case TOP_PATCHLIST_15:
2759 this->pfnPaFunc = PaPatchList<15>;
2760 #if ENABLE_AVX512_SIMD16
2761 this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
2762 #endif
2763 break;
2764 case TOP_PATCHLIST_16:
2765 this->pfnPaFunc = PaPatchList<16>;
2766 #if ENABLE_AVX512_SIMD16
2767 this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
2768 #endif
2769 break;
2770 case TOP_PATCHLIST_17:
2771 this->pfnPaFunc = PaPatchList<17>;
2772 #if ENABLE_AVX512_SIMD16
2773 this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
2774 #endif
2775 break;
2776 case TOP_PATCHLIST_18:
2777 this->pfnPaFunc = PaPatchList<18>;
2778 #if ENABLE_AVX512_SIMD16
2779 this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
2780 #endif
2781 break;
2782 case TOP_PATCHLIST_19:
2783 this->pfnPaFunc = PaPatchList<19>;
2784 #if ENABLE_AVX512_SIMD16
2785 this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
2786 #endif
2787 break;
2788 case TOP_PATCHLIST_20:
2789 this->pfnPaFunc = PaPatchList<20>;
2790 #if ENABLE_AVX512_SIMD16
2791 this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
2792 #endif
2793 break;
2794 case TOP_PATCHLIST_21:
2795 this->pfnPaFunc = PaPatchList<21>;
2796 #if ENABLE_AVX512_SIMD16
2797 this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
2798 #endif
2799 break;
2800 case TOP_PATCHLIST_22:
2801 this->pfnPaFunc = PaPatchList<22>;
2802 #if ENABLE_AVX512_SIMD16
2803 this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
2804 #endif
2805 break;
2806 case TOP_PATCHLIST_23:
2807 this->pfnPaFunc = PaPatchList<23>;
2808 #if ENABLE_AVX512_SIMD16
2809 this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
2810 #endif
2811 break;
2812 case TOP_PATCHLIST_24:
2813 this->pfnPaFunc = PaPatchList<24>;
2814 #if ENABLE_AVX512_SIMD16
2815 this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
2816 #endif
2817 break;
2818 case TOP_PATCHLIST_25:
2819 this->pfnPaFunc = PaPatchList<25>;
2820 #if ENABLE_AVX512_SIMD16
2821 this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
2822 #endif
2823 break;
2824 case TOP_PATCHLIST_26:
2825 this->pfnPaFunc = PaPatchList<26>;
2826 #if ENABLE_AVX512_SIMD16
2827 this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
2828 #endif
2829 break;
2830 case TOP_PATCHLIST_27:
2831 this->pfnPaFunc = PaPatchList<27>;
2832 #if ENABLE_AVX512_SIMD16
2833 this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
2834 #endif
2835 break;
2836 case TOP_PATCHLIST_28:
2837 this->pfnPaFunc = PaPatchList<28>;
2838 #if ENABLE_AVX512_SIMD16
2839 this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
2840 #endif
2841 break;
2842 case TOP_PATCHLIST_29:
2843 this->pfnPaFunc = PaPatchList<29>;
2844 #if ENABLE_AVX512_SIMD16
2845 this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
2846 #endif
2847 break;
2848 case TOP_PATCHLIST_30:
2849 this->pfnPaFunc = PaPatchList<30>;
2850 #if ENABLE_AVX512_SIMD16
2851 this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
2852 #endif
2853 break;
2854 case TOP_PATCHLIST_31:
2855 this->pfnPaFunc = PaPatchList<31>;
2856 #if ENABLE_AVX512_SIMD16
2857 this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
2858 #endif
2859 break;
2860 case TOP_PATCHLIST_32:
2861 this->pfnPaFunc = PaPatchList<32>;
2862 #if ENABLE_AVX512_SIMD16
2863 this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
2864 #endif
2865 break;
2866
2867 default:
2868 SWR_INVALID("Invalid topology: %d", this->binTopology);
2869 break;
2870 };
2871
2872 this->pfnPaFuncReset = this->pfnPaFunc;
2873 #if ENABLE_AVX512_SIMD16
2874 this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
2875 #endif
2876
2877 #if USE_SIMD16_FRONTEND
2878 simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
2879 simd16scalari id82 = _simd16_set_epi32( 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
2880
2881 #else
2882 simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2883 simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
2884
2885 #endif
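    // A quad or rect decomposes into two triangles that share one primitive ID, which
    // is why id82 / id4 repeat each ID and why primIDIncr is halved for those topologies.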
2886 switch(this->binTopology)
2887 {
2888 case TOP_TRIANGLE_LIST:
2889 case TOP_TRIANGLE_STRIP:
2890 case TOP_TRIANGLE_FAN:
2891 case TOP_LINE_STRIP:
2892 case TOP_LINE_LIST:
2893 case TOP_LINE_LOOP:
2894 #if USE_SIMD16_FRONTEND
2895 this->primIDIncr = 16;
2896 this->primID = id16;
2897 #else
2898 this->primIDIncr = 8;
2899 this->primID = id8;
2900 #endif
2901 break;
2902 case TOP_QUAD_LIST:
2903 case TOP_QUAD_STRIP:
2904 case TOP_RECT_LIST:
2905 #if USE_SIMD16_FRONTEND
2906 this->primIDIncr = 8;
2907 this->primID = id82;
2908 #else
2909 this->primIDIncr = 4;
2910 this->primID = id4;
2911 #endif
2912 break;
2913 case TOP_POINT_LIST:
2914 #if USE_SIMD16_FRONTEND
2915 this->primIDIncr = 16;
2916 this->primID = id16;
2917 #else
2918 this->primIDIncr = 8;
2919 this->primID = id8;
2920 #endif
2921 break;
2922 case TOP_PATCHLIST_1:
2923 case TOP_PATCHLIST_2:
2924 case TOP_PATCHLIST_3:
2925 case TOP_PATCHLIST_4:
2926 case TOP_PATCHLIST_5:
2927 case TOP_PATCHLIST_6:
2928 case TOP_PATCHLIST_7:
2929 case TOP_PATCHLIST_8:
2930 case TOP_PATCHLIST_9:
2931 case TOP_PATCHLIST_10:
2932 case TOP_PATCHLIST_11:
2933 case TOP_PATCHLIST_12:
2934 case TOP_PATCHLIST_13:
2935 case TOP_PATCHLIST_14:
2936 case TOP_PATCHLIST_15:
2937 case TOP_PATCHLIST_16:
2938 case TOP_PATCHLIST_17:
2939 case TOP_PATCHLIST_18:
2940 case TOP_PATCHLIST_19:
2941 case TOP_PATCHLIST_20:
2942 case TOP_PATCHLIST_21:
2943 case TOP_PATCHLIST_22:
2944 case TOP_PATCHLIST_23:
2945 case TOP_PATCHLIST_24:
2946 case TOP_PATCHLIST_25:
2947 case TOP_PATCHLIST_26:
2948 case TOP_PATCHLIST_27:
2949 case TOP_PATCHLIST_28:
2950 case TOP_PATCHLIST_29:
2951 case TOP_PATCHLIST_30:
2952 case TOP_PATCHLIST_31:
2953 case TOP_PATCHLIST_32:
2954 // Always run KNOB_SIMD_WIDTH number of patches at a time.
2955 #if USE_SIMD16_FRONTEND
2956 this->primIDIncr = 16;
2957 this->primID = id16;
2958 #else
2959 this->primIDIncr = 8;
2960 this->primID = id8;
2961 #endif
2962 break;
2963
2964 default:
2965 SWR_INVALID("Invalid topology: %d", this->binTopology);
2966 break;
2967 };
2968
2969 }
2970 #endif
2971