/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__

#if ENABLE_AVX512_SIMD16

#if KNOB_SIMD16_WIDTH == 16

#if ENABLE_AVX512_EMULATION
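// In emulation mode, each 512-bit SIMD16 type is modeled as a lo/hi pair of
// 256-bit AVX registers, and a 16-lane mask as a pair of 8-bit halves; the
// native __m512* and __mmask16 types are used otherwise.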
struct simd16scalar
{
    __m256  lo;
    __m256  hi;
};
struct simd16scalard
{
    __m256d lo;
    __m256d hi;
};
struct simd16scalari
{
    __m256i lo;
    __m256i hi;
};
typedef uint16_t simd16mask;

#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8))
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
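// For example, _simd16_setmask(0xFF, 0x0F) packs two 8-lane movemask bytes
// into 0xFF0F; _simd16_masklo and _simd16_maskhi recover the two halves.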

#else
typedef __m512 simd16scalar;
typedef __m512d simd16scalard;
typedef __m512i simd16scalari;
typedef __mmask16 simd16mask;
#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16

OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
    simd16scalar v[4];
    struct
    {
        simd16scalar x, y, z, w;
    };

    simd16scalar& operator[] (const int i) { return v[i]; }
    const simd16scalar& operator[] (const int i) const { return v[i]; }
};
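// Note: the anonymous struct lets v[0]..v[3] alias the named x, y, z, and w
// components of simd16vector.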

#if ENABLE_AVX512_EMULATION

#define SIMD16_EMU_AVX512_0(type, func, intrin) \
INLINE type func()\
{\
    type result;\
\
    result.lo = intrin();\
    result.hi = intrin();\
\
    return result;\
}

#define SIMD16_EMU_AVX512_1(type, func, intrin) \
INLINE type func(type a)\
{\
    type result;\
\
    result.lo = intrin(a.lo);\
    result.hi = intrin(a.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_2(type, func, intrin) \
INLINE type func(type a, type b)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo);\
    result.hi = intrin(a.hi, b.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_3(type, func, intrin) \
INLINE type func(type a, type b, type c)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo, c.lo);\
    result.hi = intrin(a.hi, b.hi, c.hi);\
\
    return result;\
}
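// For reference, SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
// expands to an inline function that applies the AVX intrinsic to each half:
//
//     INLINE simd16scalar _simd16_add_ps(simd16scalar a, simd16scalar b)
//     {
//         simd16scalar result;
//
//         result.lo = _mm256_add_ps(a.lo, b.lo);
//         result.hi = _mm256_add_ps(a.hi, b.hi);
//
//         return result;
//     }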

SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)

INLINE simd16scalar _simd16_set1_ps(float a)
{
    simd16scalar result;

    result.lo = _mm256_set1_ps(a);
    result.hi = _mm256_set1_ps(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi8(char a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi8(a);
    result.hi = _mm256_set1_epi8(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi32(int a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi32(a);
    result.hi = _mm256_set1_epi32(a);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalar _simd16_load_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_load_ps(m);
    result.hi = _mm256_load_ps(n);

    return result;
}

INLINE simd16scalar _simd16_loadu_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_loadu_ps(m);
    result.hi = _mm256_loadu_ps(n);

    return result;
}

INLINE simd16scalar _simd16_load1_ps(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalari _simd16_load_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_load_si256(&m[0].lo);
    result.hi = _mm256_load_si256(&m[0].hi);

    return result;
}

INLINE simd16scalari _simd16_loadu_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_loadu_si256(&m[0].lo);
    result.hi = _mm256_loadu_si256(&m[0].hi);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ss(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ps(__m128 const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ps(m);
    result.hi = _mm256_broadcast_ps(m);

    return result;
}

INLINE void _simd16_store_ps(float *m, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_store_ps(m, a.lo);
    _mm256_store_ps(n, a.hi);
}

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_maskstore_ps(m, mask.lo, a.lo);
    _mm256_maskstore_ps(n, mask.hi, a.hi);
}

INLINE void _simd16_store_si(simd16scalari *m, simd16scalari a)
{
    _mm256_store_si256(&m[0].lo, a.lo);
    _mm256_store_si256(&m[0].hi, a.hi);
}

INLINE simdscalar _simd16_extract_ps(simd16scalar a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_ps(0.0f);
}

INLINE simdscalari _simd16_extract_si(simd16scalari a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_epi32(0);
}

INLINE simd16scalar _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

INLINE simd16scalari _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

template <simd16mask mask>
INLINE simd16scalar _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));

    return result;
}

#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
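// The template trampoline keeps the blend mask a compile-time constant, as
// required by the immediate operand of _mm256_blend_ps.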

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));

    return result;
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_epi8(simd16scalari a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);

    return mask;
}

INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvtps_epi32(a.lo);
    result.hi = _mm256_cvtps_epi32(a.hi);

    return result;
}

INLINE simd16scalari _simd16_cvttps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvttps_epi32(a.lo);
    result.hi = _mm256_cvttps_epi32(a.hi);

    return result;
}

INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_cvtepi32_ps(a.lo);
    result.hi = _mm256_cvtepi32_ps(a.hi);

    return result;
}

template <int comp>
INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);

    return result;
}

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)

INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalari _simd16_castps_si(simd16scalar a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalard _simd16_castsi_pd(simd16scalari a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

INLINE simd16scalari _simd16_castpd_si(simd16scalard a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalar _simd16_castpd_ps(simd16scalard a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalard _simd16_castps_pd(simd16scalar a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _mm256_andnot_ps)

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    simd16scalar result;

    result.lo = _mm256_round_ps(a.lo, mode);
    result.hi = _mm256_round_ps(a.hi, mode);

    return result;
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)

INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)

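// As with _simd16_blend_ps, the shift helpers below take the immediate as a
// template parameter so it stays a compile-time constant for the underlying
// immediate-shift intrinsics.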
template <int imm8>
INLINE simd16scalari _simd16_slli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_slli_epi32(a.lo, imm8);
    result.hi = _simd_slli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srai_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srai_epi32(a.lo, imm8);
    result.hi = _simd_srai_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srli_epi32(a.lo, imm8);
    result.hi = _simd_srli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)

//__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
template <int scale>
INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
{
    simd16scalar result;

    result.lo = _simd_i32gather_ps(m, index.lo, scale);
    result.hi = _simd_i32gather_ps(m, index.hi, scale);

    return result;
}

#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)

//__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
template <int scale>
INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
    simd16scalar result;

    result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
    result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);

    return result;
}

#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)

INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
    simd16scalar result;

    const simdscalari mask = _simd_set1_epi32(7);

    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));

    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));

    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));

    return result;
}
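// Cross-half permutes are emulated by permuting both 256-bit halves with the
// index modulo 8 (the & 7 mask) and then selecting per lane between the lo-
// and hi-sourced results wherever the original index exceeds 7.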

INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
{
    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)

template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
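// The imm8 remaps above (and in the pd/si variants below) translate the four
// 2-bit 128-bit-lane selectors of the 512-bit shuffle control into the
// bits-[1:0]/[5:4] layout expected by the 256-bit permute2f128 controls.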

template <int imm8>
INLINE simd16scalard _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
{
    simd16scalari result;

    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalar _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);

    return result;
}

#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalard _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));

    return result;
}

#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16scalari _simd16_cvtepu8_epi16(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));

    return result;
}

INLINE simd16scalari _simd16_cvtepu8_epi32(__m128i a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi32(a);
    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));

    return result;
}

INLINE simd16scalari _simd16_cvtepu16_epi32(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return mask;
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return mask;
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}
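// For example, vMask16(0x0005) produces a vector with lanes 0 and 2 set to
// all ones (0xFFFFFFFF) and every other lane zero.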

#else

INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}
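// Collapses a lanewise 0/0xFFFFFFFF vector mask into a compact __mmask16;
// any nonzero lane sets the corresponding mask bit.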

#if 0
INLINE simd16mask _simd16_scalard2mask(simd16scalard mask)
{
    return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_epi64());
}
#endif

#define _simd16_setzero_ps _mm512_setzero_ps
#define _simd16_setzero_si _mm512_setzero_si512
#define _simd16_set1_ps _mm512_set1_ps
#define _simd16_set1_epi8 _mm512_set1_epi8
#define _simd16_set1_epi32 _mm512_set1_epi32

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

#define _simd16_load_ps _mm512_load_ps
#define _simd16_loadu_ps _mm512_loadu_ps
#if 1
#define _simd16_load1_ps _simd16_broadcast_ss
#endif
#define _simd16_load_si _mm512_load_si512
#define _simd16_loadu_si _mm512_loadu_si512
#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
#define _simd16_store_ps _mm512_store_ps
#define _simd16_store_si _mm512_store_si512
#define _simd16_extract_ps _mm512_extractf32x8_ps
#define _simd16_extract_si _mm512_extracti32x8_epi32
#define _simd16_insert_ps _mm512_insertf32x8
#define _simd16_insert_si _mm512_inserti32x8

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    simd16mask k = _simd16_scalari2mask(mask);

    _mm512_mask_store_ps(m, k, a);
}

#define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b)

INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_ps(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_epi32(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16mask k = _simd16_scalari2mask(mask);

    return _mm512_mask_blend_epi32(k, a, b);
}

#define _simd16_mul_ps _mm512_mul_ps
#define _simd16_add_ps _mm512_add_ps
#define _simd16_sub_ps _mm512_sub_ps
#define _simd16_rsqrt_ps _mm512_rsqrt14_ps
#define _simd16_min_ps _mm512_min_ps
#define _simd16_max_ps _mm512_max_ps

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    return _simd16_scalari2mask(_mm512_castps_si512(a));
}

#if 0
INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    return _simd16_scalard2mask(_mm512_castpd_si512(a));
}
#endif

#if 0
INLINE int _simd16_movemask_epi8(simd16scalari a)
{
    return _simd16_scalari2mask(a);
}
#endif

#define _simd16_cvtps_epi32 _mm512_cvtps_epi32
#define _simd16_cvttps_epi32 _mm512_cvttps_epi32
#define _simd16_cvtepi32_ps _mm512_cvtepi32_ps

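// AVX-512 comparisons produce mask registers rather than vectors, so the
// AVX-style all-zeros/all-ones lane results are reconstructed by blending
// against constants, as in _simd16_cmp_ps_temp below.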
template <int comp>
INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);

    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
}

#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)

#define _simd16_castsi_ps _mm512_castsi512_ps
#define _simd16_castps_si _mm512_castps_si512
#define _simd16_castsi_pd _mm512_castsi512_pd
#define _simd16_castpd_si _mm512_castpd_si512
#define _simd16_castpd_ps _mm512_castpd_ps
#define _simd16_castps_pd _mm512_castps_pd

#define _simd16_andnot_ps _mm512_andnot_ps

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    return _mm512_roundscale_ps(a, mode);
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

#define _simd16_mul_epi32 _mm512_mul_epi32
#define _simd16_mullo_epi32 _mm512_mullo_epi32
#define _simd16_sub_epi32 _mm512_sub_epi32
#define _simd16_sub_epi64 _mm512_sub_epi64
#define _simd16_min_epi32 _mm512_min_epi32
#define _simd16_max_epi32 _mm512_max_epi32
#define _simd16_min_epu32 _mm512_min_epu32
#define _simd16_max_epu32 _mm512_max_epu32
#define _simd16_add_epi32 _mm512_add_epi32
#define _simd16_and_si _mm512_and_si512
#define _simd16_andnot_si _mm512_andnot_si512
#define _simd16_or_si _mm512_or_si512
#define _simd16_xor_si _mm512_xor_si512

INLINE simd16scalari _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmplt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

#if 0
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#endif

#define _simd16_unpacklo_ps _mm512_unpacklo_ps
#define _simd16_unpackhi_ps _mm512_unpackhi_ps
#define _simd16_unpacklo_pd _mm512_unpacklo_pd
#define _simd16_unpackhi_pd _mm512_unpackhi_pd
#define _simd16_unpacklo_epi8 _mm512_unpacklo_epi8
#define _simd16_unpackhi_epi8 _mm512_unpackhi_epi8
#define _simd16_unpacklo_epi16 _mm512_unpacklo_epi16
#define _simd16_unpackhi_epi16 _mm512_unpackhi_epi16
#define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32
#define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32
#define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64
#define _simd16_unpackhi_epi64 _mm512_unpackhi_epi64
#define _simd16_slli_epi32 _mm512_slli_epi32
#define _simd16_srli_epi32 _mm512_srli_epi32
#define _simd16_srai_epi32 _mm512_srai_epi32
#define _simd16_fmadd_ps _mm512_fmadd_ps
#define _simd16_fmsub_ps _mm512_fmsub_ps
#define _simd16_adds_epu8 _mm512_adds_epu8
#define _simd16_subs_epu8 _mm512_subs_epu8
#define _simd16_add_epi8 _mm512_add_epi8
#define _simd16_shuffle_epi8 _mm512_shuffle_epi8
#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)
#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _mm512_mask_i32gather_ps(a, mask, index, m, scale)

#define _simd16_abs_epi32 _mm512_abs_epi32

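// The narrower integer compares below likewise expand the per-width AVX-512
// mask types (__mmask8/__mmask32/__mmask64) back into all-zeros/all-ones
// vectors.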
INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

#define _simd16_permute_ps(a, i) _mm512_permutexvar_ps(i, a)
#define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a)
#define _simd16_sllv_epi32 _mm512_sllv_epi32
#define _simd16_srlv_epi32 _mm512_srlv_epi32
#define _simd16_permute2f128_ps _mm512_shuffle_f32x4
#define _simd16_permute2f128_pd _mm512_shuffle_f64x2
#define _simd16_permute2f128_si _mm512_shuffle_i32x4
#define _simd16_shuffle_ps _mm512_shuffle_ps
#define _simd16_shuffle_pd _mm512_shuffle_pd
#define _simd16_cvtepu8_epi16 _mm512_cvtepu8_epi16
#define _simd16_cvtepu8_epi32 _mm512_cvtepu8_epi32
#define _simd16_cvtepu16_epi32 _mm512_cvtepu16_epi32
#define _simd16_packus_epi16 _mm512_packus_epi16
#define _simd16_packs_epi16 _mm512_packs_epi16
#define _simd16_packus_epi32 _mm512_packus_epi32
#define _simd16_packs_epi32 _mm512_packs_epi32

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return _mm512_int2mask(mask);
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return _mm512_mask2int(mask);
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _mm512_cmplt_ps_mask(a, b);
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#endif//ENABLE_AVX512_EMULATION

#endif//ENABLE_AVX512_SIMD16

#endif//__SWR_SIMD16INTRIN_H__