• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 
24 #ifndef __SWR_SIMD16INTRIN_H__
25 #define __SWR_SIMD16INTRIN_H__
26 
27 #if ENABLE_AVX512_SIMD16
28 
29 #if KNOB_SIMD16_WIDTH == 16
30 
31 #if ENABLE_AVX512_EMULATION
// Emulated 16-wide (512-bit) register types: each holds a pair of AVX
// 256-bit halves.  .lo carries lanes 0..7, .hi carries lanes 8..15.
struct simd16scalar
{
    __m256  lo;
    __m256  hi;
};

// Double-precision variant (2 x 4 doubles).
struct simd16scalard
{
    __m256d lo;
    __m256d hi;
};

// Integer variant (2 x 8 dwords).
struct simd16scalari
{
    __m256i lo;
    __m256i hi;
};

// One bit per 32-bit lane: low byte maps to .lo, high byte to .hi.
typedef uint16_t simd16mask;

#define _simd16_masklo(mask) ((mask) & 0xFF)      // bits for lanes 0..7
#define _simd16_maskhi(mask) (((mask) >> 8))      // bits for lanes 8..15
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
52 
53 #else
54 typedef __m512 simd16scalar;
55 typedef __m512d simd16scalard;
56 typedef __m512i simd16scalari;
57 typedef __mmask16 simd16mask;
58 #endif//ENABLE_AVX512_EMULATION
59 #else
60 #error Unsupported vector width
61 #endif//KNOB_SIMD16_WIDTH == 16
62 
// 4-component (x/y/z/w) bundle of 16-wide registers, aligned to the full
// SIMD16 width.  The anonymous struct aliases v[0..3] for named access.
OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
    simd16scalar  v[4];
    struct
    {
        simd16scalar x, y, z, w;
    };

    simd16scalar& operator[] (const int i) { return v[i]; }
    const simd16scalar& operator[] (const int i) const { return v[i]; }
};
74 
75 #if ENABLE_AVX512_EMULATION
76 
// Generators for emulated 16-wide functions: each expands to an INLINE
// function that applies an 8-wide intrinsic independently to the .lo and
// .hi halves.  The suffix (_0.._3) is the operand arity.
#define SIMD16_EMU_AVX512_0(type, func, intrin) \
INLINE type func()\
{\
    type r;\
\
    r.lo = intrin();\
    r.hi = intrin();\
\
    return r;\
}

#define SIMD16_EMU_AVX512_1(type, func, intrin) \
INLINE type func(type a)\
{\
    type r;\
\
    r.lo = intrin(a.lo);\
    r.hi = intrin(a.hi);\
\
    return r;\
}

#define SIMD16_EMU_AVX512_2(type, func, intrin) \
INLINE type func(type a, type b)\
{\
    type r;\
\
    r.lo = intrin(a.lo, b.lo);\
    r.hi = intrin(a.hi, b.hi);\
\
    return r;\
}

#define SIMD16_EMU_AVX512_3(type, func, intrin) \
INLINE type func(type a, type b, type c)\
{\
    type r;\
\
    r.lo = intrin(a.lo, b.lo, c.lo);\
    r.hi = intrin(a.hi, b.hi, c.hi);\
\
    return r;\
}
120 
SIMD16_EMU_AVX512_0(simd16scalar,_simd16_setzero_ps,_mm256_setzero_ps)121 SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
122 SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)
123 
124 INLINE simd16scalar _simd16_set1_ps(float a)
125 {
126     simd16scalar result;
127 
128     result.lo = _mm256_set1_ps(a);
129     result.hi = _mm256_set1_ps(a);
130 
131     return result;
132 }
133 
_simd16_set1_epi8(char a)134 INLINE simd16scalari _simd16_set1_epi8(char a)
135 {
136     simd16scalari result;
137 
138     result.lo = _mm256_set1_epi8(a);
139     result.hi = _mm256_set1_epi8(a);
140 
141     return result;
142 }
143 
_simd16_set1_epi32(int a)144 INLINE simd16scalari _simd16_set1_epi32(int a)
145 {
146     simd16scalari result;
147 
148     result.lo = _mm256_set1_epi32(a);
149     result.hi = _mm256_set1_epi32(a);
150 
151     return result;
152 }
153 
_simd16_set_ps(float e15,float e14,float e13,float e12,float e11,float e10,float e9,float e8,float e7,float e6,float e5,float e4,float e3,float e2,float e1,float e0)154 INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
155 {
156     simd16scalar result;
157 
158     result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
159     result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);
160 
161     return result;
162 }
163 
_simd16_set_epi32(int e15,int e14,int e13,int e12,int e11,int e10,int e9,int e8,int e7,int e6,int e5,int e4,int e3,int e2,int e1,int e0)164 INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
165 {
166     simd16scalari result;
167 
168     result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
169     result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);
170 
171     return result;
172 }
173 
_simd16_set_ps(float e7,float e6,float e5,float e4,float e3,float e2,float e1,float e0)174 INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
175 {
176     simd16scalar result;
177 
178     result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
179     result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
180 
181     return result;
182 }
183 
_simd16_set_epi32(int e7,int e6,int e5,int e4,int e3,int e2,int e1,int e0)184 INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
185 {
186     simd16scalari result;
187 
188     result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
189     result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
190 
191     return result;
192 }
193 
_simd16_load_ps(float const * m)194 INLINE simd16scalar _simd16_load_ps(float const *m)
195 {
196     simd16scalar result;
197 
198     float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
199 
200     result.lo = _mm256_load_ps(m);
201     result.hi = _mm256_load_ps(n);
202 
203     return result;
204 }
205 
_simd16_loadu_ps(float const * m)206 INLINE simd16scalar _simd16_loadu_ps(float const *m)
207 {
208     simd16scalar result;
209 
210     float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
211 
212     result.lo = _mm256_loadu_ps(m);
213     result.hi = _mm256_loadu_ps(n);
214 
215     return result;
216 }
217 
_simd16_load1_ps(float const * m)218 INLINE simd16scalar _simd16_load1_ps(float const *m)
219 {
220     simd16scalar result;
221 
222     result.lo = _mm256_broadcast_ss(m);
223     result.hi = _mm256_broadcast_ss(m);
224 
225     return result;
226 }
227 
_simd16_load_si(simd16scalari const * m)228 INLINE simd16scalari _simd16_load_si(simd16scalari const *m)
229 {
230     simd16scalari result;
231 
232     result.lo = _mm256_load_si256(&m[0].lo);
233     result.hi = _mm256_load_si256(&m[0].hi);
234 
235     return result;
236 }
237 
_simd16_loadu_si(simd16scalari const * m)238 INLINE simd16scalari _simd16_loadu_si(simd16scalari const *m)
239 {
240     simd16scalari result;
241 
242     result.lo = _mm256_loadu_si256(&m[0].lo);
243     result.hi = _mm256_loadu_si256(&m[0].hi);
244 
245     return result;
246 }
247 
_simd16_broadcast_ss(float const * m)248 INLINE simd16scalar _simd16_broadcast_ss(float const *m)
249 {
250     simd16scalar result;
251 
252     result.lo = _mm256_broadcast_ss(m);
253     result.hi = _mm256_broadcast_ss(m);
254 
255     return result;
256 }
257 
_simd16_broadcast_ps(__m128 const * m)258 INLINE simd16scalar _simd16_broadcast_ps(__m128 const *m)
259 {
260     simd16scalar result;
261 
262     result.lo = _mm256_broadcast_ps(m);
263     result.hi = _mm256_broadcast_ps(m);
264 
265     return result;
266 }
267 
_simd16_store_ps(float * m,simd16scalar a)268 INLINE void _simd16_store_ps(float *m, simd16scalar a)
269 {
270     float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
271 
272     _mm256_store_ps(m, a.lo);
273     _mm256_store_ps(n, a.hi);
274 }
275 
_simd16_maskstore_ps(float * m,simd16scalari mask,simd16scalar a)276 INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
277 {
278     float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
279 
280     _mm256_maskstore_ps(m, mask.lo, a.lo);
281     _mm256_maskstore_ps(n, mask.hi, a.hi);
282 }
283 
_simd16_store_si(simd16scalari * m,simd16scalari a)284 INLINE void _simd16_store_si(simd16scalari *m, simd16scalari a)
285 {
286     _mm256_store_si256(&m[0].lo, a.lo);
287     _mm256_store_si256(&m[0].hi, a.hi);
288 }
289 
_simd16_extract_ps(simd16scalar a,int imm8)290 INLINE simdscalar _simd16_extract_ps(simd16scalar a, int imm8)
291 {
292     switch (imm8)
293     {
294     case 0:
295         return a.lo;
296     case 1:
297         return a.hi;
298     }
299     return _simd_set1_ps(0.0f);
300 }
301 
_simd16_extract_si(simd16scalari a,int imm8)302 INLINE simdscalari _simd16_extract_si(simd16scalari a, int imm8)
303 {
304     switch (imm8)
305     {
306     case 0:
307         return a.lo;
308     case 1:
309         return a.hi;
310     }
311     return _simd_set1_epi32(0);
312 }
313 
_simd16_insert_ps(simd16scalar a,simdscalar b,int imm8)314 INLINE simd16scalar _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
315 {
316     switch (imm8)
317     {
318     case 0:
319         a.lo = b;
320         break;
321     case 1:
322         a.hi = b;
323         break;
324     }
325     return a;
326 }
327 
_simd16_insert_si(simd16scalari a,simdscalari b,int imm8)328 INLINE simd16scalari _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
329 {
330     switch (imm8)
331     {
332     case 0:
333         a.lo = b;
334         break;
335     case 1:
336         a.hi = b;
337         break;
338     }
339     return a;
340 }
341 
342 template <simd16mask mask>
_simd16_blend_ps_temp(simd16scalar a,simd16scalar b)343 INLINE simd16scalar _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
344 {
345     simd16scalar result;
346 
347     result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
348     result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));
349 
350     return result;
351 }
352 
353 #define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
354 
SIMD16_EMU_AVX512_3(simd16scalar,_simd16_blendv_ps,_mm256_blendv_ps)355 SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)
356 
357 INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
358 {
359     simd16scalari result;
360 
361     result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
362     result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));
363 
364     return result;
365 }
366 
_simd16_blendv_epi32(simd16scalari a,simd16scalari b,const simd16scalari mask)367 INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
368 {
369     simd16scalari result;
370 
371     result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
372     result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));
373 
374     return result;
375 }
376 
// Elementwise float arithmetic, generated from the per-half emulation macros.
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)
383 
384 INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
385 {
386     simd16mask mask;
387 
388     reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
389     reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);
390 
391     return mask;
392 }
393 
_simd16_movemask_pd(simd16scalard a)394 INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
395 {
396     simd16mask mask;
397 
398     reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
399     reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);
400 
401     return mask;
402 }
403 
_simd16_movemask_epi8(simd16scalari a)404 INLINE simd16mask _simd16_movemask_epi8(simd16scalari a)
405 {
406     simd16mask mask;
407 
408     reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
409     reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);
410 
411     return mask;
412 }
413 
_simd16_cvtps_epi32(simd16scalar a)414 INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a)
415 {
416     simd16scalari result;
417 
418     result.lo = _mm256_cvtps_epi32(a.lo);
419     result.hi = _mm256_cvtps_epi32(a.hi);
420 
421     return result;
422 }
423 
_simd16_cvttps_epi32(simd16scalar a)424 INLINE simd16scalari _simd16_cvttps_epi32(simd16scalar a)
425 {
426     simd16scalari result;
427 
428     result.lo = _mm256_cvttps_epi32(a.lo);
429     result.hi = _mm256_cvttps_epi32(a.hi);
430 
431     return result;
432 }
433 
_simd16_cvtepi32_ps(simd16scalari a)434 INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a)
435 {
436     simd16scalar result;
437 
438     result.lo = _mm256_cvtepi32_ps(a.lo);
439     result.hi = _mm256_cvtepi32_ps(a.hi);
440 
441     return result;
442 }
443 
444 template <int comp>
_simd16_cmp_ps(simd16scalar a,simd16scalar b)445 INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
446 {
447     simd16scalar result;
448 
449     result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
450     result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);
451 
452     return result;
453 }
454 
455 #define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
456 #define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
457 #define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
458 #define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
459 #define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
460 #define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)
461 
SIMD16_EMU_AVX512_2(simd16scalar,_simd16_and_ps,_simd_and_ps)462 SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
463 SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
464 SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
465 SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
466 
467 INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
468 {
469     return *reinterpret_cast<simd16scalar *>(&a);
470 }
471 
_simd16_castps_si(simd16scalar a)472 INLINE simd16scalari _simd16_castps_si(simd16scalar a)
473 {
474     return *reinterpret_cast<simd16scalari *>(&a);
475 }
476 
_simd16_castsi_pd(simd16scalari a)477 INLINE simd16scalard _simd16_castsi_pd(simd16scalari a)
478 {
479     return *reinterpret_cast<simd16scalard *>(&a);
480 }
481 
_simd16_castpd_si(simd16scalard a)482 INLINE simd16scalari _simd16_castpd_si(simd16scalard a)
483 {
484     return *reinterpret_cast<simd16scalari *>(&a);
485 }
486 
_simd16_castpd_ps(simd16scalard a)487 INLINE simd16scalar _simd16_castpd_ps(simd16scalard a)
488 {
489     return *reinterpret_cast<simd16scalar *>(&a);
490 }
491 
_simd16_castps_pd(simd16scalar a)492 INLINE simd16scalard _simd16_castps_pd(simd16scalar a)
493 {
494     return *reinterpret_cast<simd16scalard *>(&a);
495 }
496 
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _mm256_andnot_ps)

// Round each lane; the rounding mode is a template parameter because
// _mm256_round_ps requires an immediate.
template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    simd16scalar r;

    r.lo = _mm256_round_ps(a.lo, mode);
    r.hi = _mm256_round_ps(a.hi, mode);

    return r;
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
511 
// Integer arithmetic / logic, generated from the per-half emulation macros.
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)

// True (1) only when (a & b) has no sign bits set in either half.
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    return _mm256_testz_ps(a.lo, b.lo) & _mm256_testz_ps(a.hi, b.hi);
}

// NOTE(review): this macro shadows the _simd16_cmplt_epi32 function
// generated above for all later uses; equivalent since a < b <=> b > a.
#define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)
538 
// Per-half unpack/interleave operations.
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)
552 
553 template <int imm8>
554 INLINE simd16scalari _simd16_slli_epi32_temp(simd16scalari a)
555 {
556     simd16scalari result;
557 
558     result.lo = _simd_slli_epi32(a.lo, imm8);
559     result.hi = _simd_slli_epi32(a.hi, imm8);
560 
561     return result;
562 }
563 
564 #define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)
565 
566 template <int imm8>
_simd16_srai_epi32_temp(simd16scalari a)567 INLINE simd16scalari _simd16_srai_epi32_temp(simd16scalari a)
568 {
569     simd16scalari result;
570 
571     result.lo = _simd_srai_epi32(a.lo, imm8);
572     result.hi = _simd_srai_epi32(a.hi, imm8);
573 
574     return result;
575 }
576 
577 #define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)
578 
579 template <int imm8>
_simd16_srli_epi32_temp(simd16scalari a)580 INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
581 {
582     simd16scalari result;
583 
584     result.lo = _simd_srli_epi32(a.lo, imm8);
585     result.hi = _simd_srli_epi32(a.hi, imm8);
586 
587     return result;
588 }
589 
590 #define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
591 
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)

//__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
// Gather 16 floats from m at the given dword offsets; scale is a template
// parameter because the underlying gather takes an immediate.
template <int scale>
INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
{
    simd16scalar r;

    r.lo = _simd_i32gather_ps(m, index.lo, scale);
    r.hi = _simd_i32gather_ps(m, index.hi, scale);

    return r;
}

#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
608 
609 //__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
610 template <int scale>
_simd16_mask_i32gather_ps_temp(simd16scalar a,const float * m,simd16scalari index,simd16scalari mask)611 INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
612 {
613     simd16scalar result;
614 
615     result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
616     result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);
617 
618     return result;
619 }
620 
621 #define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, mask, index)
622 
// Per-half byte shuffles, saturating byte arithmetic, and narrow compares.
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)
634 
635 INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
636 {
637     simd16scalar result;
638 
639     const simdscalari mask = _simd_set1_epi32(7);
640 
641     simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
642     simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));
643 
644     simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
645     simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));
646 
647     result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
648     result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));
649 
650     return result;
651 }
652 
_simd16_permute_epi32(simd16scalari a,simd16scalari i)653 INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
654 {
655     return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
656 }
657 
SIMD16_EMU_AVX512_2(simd16scalari,_simd16_srlv_epi32,_simd_srlv_epi32)658 SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
659 SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)
660 
661 template <int imm8>
662 INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
663 {
664     simd16scalar result;
665 
666     result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
667     result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
668 
669     return result;
670 }
671 
672 #define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
673 
674 template <int imm8>
_simd16_permute2f128_pd_temp(simd16scalard a,simd16scalard b)675 INLINE simd16scalard _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
676 {
677     simd16scalard result;
678 
679     result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
680     result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
681 
682     return result;
683 }
684 
685 #define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)
686 
687 template <int imm8>
_simd16_permute2f128_si_temp(simd16scalari a,simd16scalari b)688 INLINE simd16scalari _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
689 {
690     simd16scalari result;
691 
692     result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
693     result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
694 
695     return result;
696 }
697 
698 #define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)
699 
700 template <int imm8>
_simd16_shuffle_ps_temp(simd16scalar a,simd16scalar b)701 INLINE simd16scalar _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
702 {
703     simd16scalar result;
704 
705     result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
706     result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);
707 
708     return result;
709 }
710 
711 #define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)
712 
713 template <int imm8>
_simd16_shuffle_pd_temp(simd16scalard a,simd16scalard b)714 INLINE simd16scalard _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
715 {
716     simd16scalard result;
717 
718     result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
719     result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));
720 
721     return result;
722 }
723 
724 #define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)
725 
726 template <int imm8>
_simd16_shuffle_epi32_temp(simd16scalari a,simd16scalari b)727 INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
728 {
729     return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
730 }
731 
732 #define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
733 
734 template <int imm8>
_simd16_shuffle_epi64_temp(simd16scalari a,simd16scalari b)735 INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
736 {
737     return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
738 }
739 
740 #define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
741 
_simd16_cvtepu8_epi16(simdscalari a)742 INLINE simd16scalari _simd16_cvtepu8_epi16(simdscalari a)
743 {
744     simd16scalari result;
745 
746     result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
747     result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));
748 
749     return result;
750 }
751 
_simd16_cvtepu8_epi32(__m128i a)752 INLINE simd16scalari _simd16_cvtepu8_epi32(__m128i a)
753 {
754     simd16scalari result;
755 
756     result.lo = _simd_cvtepu8_epi32(a);
757     result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));
758 
759     return result;
760 }
761 
_simd16_cvtepu16_epi32(simdscalari a)762 INLINE simd16scalari _simd16_cvtepu16_epi32(simdscalari a)
763 {
764     simd16scalari result;
765 
766     result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
767     result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));
768 
769     return result;
770 }
771 
SIMD16_EMU_AVX512_2(simd16scalari,_simd16_packus_epi16,_simd_packus_epi16)772 SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
773 SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
774 SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
775 SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)
776 
777 INLINE simd16mask _simd16_int2mask(int mask)
778 {
779     return mask;
780 }
781 
_simd16_mask2int(simd16mask mask)782 INLINE int _simd16_mask2int(simd16mask mask)
783 {
784     return mask;
785 }
786 
_simd16_cmplt_ps_mask(simd16scalar a,simd16scalar b)787 INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
788 {
789     return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
790 }
791 
792 // convert bitmask to vector mask
vMask16(int32_t mask)793 INLINE simd16scalar vMask16(int32_t mask)
794 {
795     simd16scalari temp = _simd16_set1_epi32(mask);
796 
797     simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
798 
799     simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
800 
801     return _simd16_castsi_ps(result);
802 }
803 
804 #else
805 
_simd16_scalari2mask(simd16scalari mask)806 INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
807 {
808     return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
809 }
810 
811 #if 0
812 INLINE simd16mask _simd16_scalard2mask(simd16scalard mask)
813 {
814     return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_epi64());
815 }
816 #endif
817 
// Native AVX-512 mappings: zero / broadcast constructors.
#define _simd16_setzero_ps      _mm512_setzero_ps
#define _simd16_setzero_si      _mm512_setzero_si512
#define _simd16_set1_ps         _mm512_set1_ps
#define _simd16_set1_epi8       _mm512_set1_epi8
#define _simd16_set1_epi32      _mm512_set1_epi32
823 
_simd16_set_ps(float e15,float e14,float e13,float e12,float e11,float e10,float e9,float e8,float e7,float e6,float e5,float e4,float e3,float e2,float e1,float e0)824 INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
825 {
826     return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
827 }
828 
_simd16_set_epi32(int e15,int e14,int e13,int e12,int e11,int e10,int e9,int e8,int e7,int e6,int e5,int e4,int e3,int e2,int e1,int e0)829 INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
830 {
831     return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
832 }
833 
_simd16_set_ps(float e7,float e6,float e5,float e4,float e3,float e2,float e1,float e0)834 INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
835 {
836     return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
837 }
838 
_simd16_set_epi32(int e7,int e6,int e5,int e4,int e3,int e2,int e1,int e0)839 INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
840 {
841     return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
842 }
843 
// Native AVX-512 mappings: memory and lane-insert/extract operations.
#define _simd16_load_ps         _mm512_load_ps
#define _simd16_loadu_ps        _mm512_loadu_ps
#if 1
#define _simd16_load1_ps        _simd16_broadcast_ss
#endif
#define _simd16_load_si         _mm512_load_si512
#define _simd16_loadu_si        _mm512_loadu_si512
#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
#define _simd16_store_ps        _mm512_store_ps
#define _simd16_store_si        _mm512_store_si512
#define _simd16_extract_ps      _mm512_extractf32x8_ps
#define _simd16_extract_si      _mm512_extracti32x8_epi32
#define _simd16_insert_ps       _mm512_insertf32x8
#define _simd16_insert_si       _mm512_inserti32x8
859 
_simd16_maskstore_ps(float * m,simd16scalari mask,simd16scalar a)860 INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
861 {
862     simd16mask k = _simd16_scalari2mask(mask);
863 
864     _mm512_mask_store_ps(m, k, a);
865 }
866 
867 #define _simd16_blend_ps(a, b, mask)    _mm512_mask_blend_ps(mask, a, b)
868 
_simd16_blendv_ps(simd16scalar a,simd16scalar b,const simd16scalar mask)869 INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
870 {
871     simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
872 
873     _mm512_mask_blend_ps(k, a, b);
874 }
875 
_simd16_blendv_epi32(simd16scalari a,simd16scalari b,const simd16scalar mask)876 INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
877 {
878     simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
879 
880     _mm512_mask_blend_epi32(k, a, b);
881 }
882 
_simd16_blendv_epi32(simd16scalari a,simd16scalari b,const simd16scalari mask)883 INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
884 {
885     simd16mask k = _simd16_scalari2mask(mask);
886 
887     _mm512_mask_blend_epi32(k, a, b);
888 }
889 
// Float arithmetic and min/max: direct AVX-512F mappings.
#define _simd16_mul_ps          _mm512_mul_ps
#define _simd16_add_ps          _mm512_add_ps
#define _simd16_sub_ps          _mm512_sub_ps
// Maps to the approximate reciprocal-sqrt intrinsic (_mm512_rsqrt14_ps).
#define _simd16_rsqrt_ps        _mm512_rsqrt14_ps
#define _simd16_min_ps          _mm512_min_ps
#define _simd16_max_ps          _mm512_max_ps
896 
_simd16_movemask_ps(simd16scalar a)897 INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
898 {
899     return  _simd16_scalari2mask(_mm512_castps_si512(a));
900 }
901 
#if 0
//////////////////////////////////////////////////////////////////////////
/// Reduce a double vector to a lane bitmask.  Still disabled.
INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    // Bug fix: the old body called the nonexistent "_mm512i_castpd_si512";
    // no cast is needed at all because _simd16_scalard2mask already takes
    // a simd16scalard.
    return  _simd16_scalard2mask(a);
}
#endif
908 
#if 0
//////////////////////////////////////////////////////////////////////////
/// Byte movemask.  Still disabled.
/// NOTE(review): _simd16_scalari2mask reduces per 32-bit lane, so this is
/// not a true per-byte movemask -- verify before enabling.
INLINE int _simd16_movemask_epi8(simd16scalari a)
{
    // Bug fix: the helper is named _simd16_scalari2mask;
    // "_simd16_scalar2mask" is not defined anywhere in this header.
    return  _simd16_scalari2mask(a);
}
#endif
915 
916 #define _simd16_cvtps_epi32     _mm512_cvtps_epi32
917 #define _simd16_cvttps_epi32    _mm512_cvttps_epi32
918 #define _simd16_cvtepi32_ps     _mm512_cvtepi32_ps
919 
920 template <int comp>
_simd16_cmp_ps_temp(simd16scalar a,simd16scalar b)921 INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
922 {
923     simd16mask k = _mm512_cmpeq_ps_mask(a, b);
924 
925     return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
926 }
927 
928 #define _simd16_cmp_ps(a, b, comp)  _simd16_cmp_ps_temp<comp>(a, b)
929 
930 #define _simd16_cmplt_ps(a, b)      _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
931 #define _simd16_cmpgt_ps(a, b)      _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
932 #define _simd16_cmpneq_ps(a, b)     _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
933 #define _simd16_cmpeq_ps(a, b)      _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
934 #define _simd16_cmpge_ps(a, b)      _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
935 #define _simd16_cmple_ps(a, b)      _simd16_cmp_ps<_CMP_LE_OQ>(a, b)
936 
937 #define _simd16_castsi_ps           _mm512_castsi512_ps
938 #define _simd16_castps_si           _mm512_castps_si512
939 #define _simd16_castsi_pd           _mm512_castsi512_pd
940 #define _simd16_castpd_si           _mm512_castpd_si512
941 #define _simd16_castpd_ps           _mm512_castpd_ps
942 #define _simd16_castps_pd           _mm512_castps_pd
943 
944 #define _simd16_andnot_ps           _mm512_andnot_ps
945 
946 template <int mode>
_simd16_round_ps_temp(simd16scalar a)947 INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
948 {
949     return _mm512_roundscale_ps(a, mode);
950 }
951 
952 #define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
953 
954 #define _simd16_mul_epi32         _mm512_mul_epi32
955 #define _simd16_mullo_epi32       _mm512_mullo_epi32
956 #define _simd16_sub_epi32         _mm512_sub_epi32
957 #define _simd16_sub_epi64         _mm512_sub_epi64
958 #define _simd16_min_epi32         _mm512_min_epi32
959 #define _simd16_max_epi32         _mm512_max_epi32
960 #define _simd16_min_epu32         _mm512_min_epu32
961 #define _simd16_max_epu32         _mm512_max_epu32
962 #define _simd16_add_epi32         _mm512_add_epi32
963 #define _simd16_and_si            _mm512_and_si512
964 #define _simd16_andnot_si         _mm512_andnot_si512
965 #define _simd16_or_si             _mm512_or_si512
966 #define _simd16_xor_si            _mm512_xor_si512
967 
_simd16_cmpeq_epi32(simd16scalari a,simd16scalari b)968 INLINE simd16scalari _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
969 {
970     simd16mask k = _mm512_cmpeq_epi32_mask(a, b);
971 
972     return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
973 }
974 
_simd16_cmpgt_epi32(simd16scalari a,simd16scalari b)975 INLINE simd16scalari _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
976 {
977     simd16mask k = _mm512_cmpgt_epi32_mask(a, b);
978 
979     return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
980 }
981 
_simd16_cmplt_epi32(simd16scalari a,simd16scalari b)982 INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
983 {
984     simd16mask k = _mm512_cmplt_epi32_mask(a, b);
985 
986     return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
987 }
988 
#if 0
// Disabled: this body reads .lo/.hi halves, which only exist on the
// emulation structs -- in this section the _mm512_* intrinsics are used on
// simd16scalar directly, so these member accesses would not compile.
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#endif
999 
1000 #define _simd16_unpacklo_ps       _mm512_unpacklo_ps
1001 #define _simd16_unpackhi_ps       _mm512_unpackhi_ps
1002 #define _simd16_unpacklo_pd       _mm512_unpacklo_pd
1003 #define _simd16_unpackhi_pd       _mm512_unpackhi_pd
1004 #define _simd16_unpacklo_epi8     _mm512_unpacklo_epi8
1005 #define _simd16_unpackhi_epi8     _mm512_unpackhi_epi8
1006 #define _simd16_unpacklo_epi16    _mm512_unpacklo_epi16
1007 #define _simd16_unpackhi_epi16    _mm512_unpackhi_epi16
1008 #define _simd16_unpacklo_epi32    _mm512_unpacklo_epi32
1009 #define _simd16_unpackhi_epi32    _mm512_unpackhi_epi32
1010 #define _simd16_unpacklo_epi64    _mm512_unpacklo_epi64
1011 #define _simd16_unpackhi_epi64    _mm512_unpackhi_epi64
1012 #define _simd16_slli_epi32        _mm512_slli_epi32
1013 #define _simd16_srli_epi32        _mm512_srli_epi32
1014 #define _simd16_srai_epi32        _mm512_srai_epi32
1015 #define _simd16_fmadd_ps          _mm512_fmadd_ps
1016 #define _simd16_fmsub_ps          _mm512_fmsub_ps
1017 #define _simd16_adds_epu8         _mm512_adds_epu8
1018 #define _simd16_subs_epu8         _mm512_subs_epu8
1019 #define _simd16_add_epi8          _mm512_add_epi8
1020 #define _simd16_shuffle_epi8      _mm512_shuffle_epi8
1021 
// (Removed duplicate _simd16_fmadd_ps / _simd16_fmsub_ps defines; they are
// already defined identically earlier in this section.)
1024 
// Gathers.  AVX-512 puts the index operand first, so arguments are
// rearranged relative to the AVX2-style helpers.
#define _simd16_i32gather_ps(m, index, scale)               _mm512_i32gather_ps(index, m, scale)

// Bug fix: _mm512_mask_i32gather_ps takes (src, k, vindex, base_addr, scale);
// the old expansion passed the base pointer where the k-mask belongs and the
// vector mask where the base pointer belongs.  Convert the vector mask to a
// k-mask and keep m as the base address.
// NOTE(review): assumes the mask argument is a simd16scalar lane mask, as in
// the blendv-style helpers above -- confirm against callers.
#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _mm512_mask_i32gather_ps(a, _simd16_scalari2mask(_mm512_castps_si512(mask)), index, m, scale)
1027 
#define _simd16_abs_epi32         _mm512_abs_epi32
// Bug fix: removed the stray "#define _simd16_cmpeq_epi64 _mm512_abs_epi32"
// copy-paste -- it aliased a 64-bit compare to an absolute-value intrinsic
// and would have macro-expanded the name inside the INLINE
// _simd16_cmpeq_epi64 definition that follows.
1030 
_simd16_cmpeq_epi64(simd16scalari a,simd16scalari b)1031 INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
1032 {
1033     __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);
1034 
1035     return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1036 }
1037 
_simd16_cmpgt_epi64(simd16scalari a,simd16scalari b)1038 INLINE simd16scalari _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
1039 {
1040     __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);
1041 
1042     return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1043 }
1044 
_simd16_cmpeq_epi16(simd16scalari a,simd16scalari b)1045 INLINE simd16scalari _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
1046 {
1047     __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);
1048 
1049     return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1050 }
1051 
_simd16_cmpgt_epi16(simd16scalari a,simd16scalari b)1052 INLINE simd16scalari _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
1053 {
1054     __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);
1055 
1056     return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1057 }
1058 
_simd16_cmpeq_epi8(simd16scalari a,simd16scalari b)1059 INLINE simd16scalari _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
1060 {
1061     __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);
1062 
1063     return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1064 }
1065 
_simd16_cmpgt_epi8(simd16scalari a,simd16scalari b)1066 INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
1067 {
1068     __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);
1069 
1070     return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1071 }
1072 
1073 #define _simd16_permute_ps(a, i)        _mm512_permutexvar_ps(i, a)
1074 #define _simd16_permute_epi32(a, i)     _mm512_permutexvar_epi32(i, a)
// Bug fix: the variable-shift aliases were swapped -- sllv (shift LEFT
// logical variable) was mapped to _mm512_srlv_epi32 and srlv (shift RIGHT)
// to _mm512_sllv_epi32, inverting shift direction for both helpers.
#define _simd16_sllv_epi32              _mm512_sllv_epi32
#define _simd16_srlv_epi32              _mm512_srlv_epi32
1077 #define _simd16_permute2f128_ps         _mm512_shuffle_f32x4
1078 #define _simd16_permute2f128_pd         _mm512_shuffle_f64x2
1079 #define _simd16_permute2f128_si         _mm512_shuffle_i32x4
1080 #define _simd16_shuffle_ps              _mm512_shuffle_ps
1081 #define _simd16_shuffle_pd              _mm512_shuffle_pd
1082 #define _simd16_cvtepu8_epi16           _mm512_cvtepu8_epi16
1083 #define _simd16_cvtepu8_epi32           _mm512_cvtepu8_epi32
1084 #define _simd16_cvtepu16_epi32          _mm512_cvtepu16_epi32
1085 #define _simd16_packus_epi16            _mm512_packus_epi16
1086 #define _simd16_packs_epi16             _mm512_packs_epi16
1087 #define _simd16_packus_epi32            _mm512_packus_epi32
1088 #define _simd16_packs_epi32             _mm512_packs_epi32
1089 
1090 template <int imm8>
_simd16_shuffle_epi32_temp(simd16scalari a,simd16scalari b)1091 INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
1092 {
1093     return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
1094 }
1095 
1096 #define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
1097 
1098 template <int imm8>
_simd16_shuffle_epi64_temp(simd16scalari a,simd16scalari b)1099 INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
1100 {
1101     return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
1102 }
1103 
1104 #define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
1105 
_simd16_int2mask(int mask)1106 INLINE simd16mask _simd16_int2mask(int mask)
1107 {
1108     return _mm512_int2mask(mask);
1109 }
1110 
_simd16_mask2int(simd16mask mask)1111 INLINE int _simd16_mask2int(simd16mask mask)
1112 {
1113     return _mm512_mask2int(mask);
1114 }
1115 
_simd16_cmplt_ps_mask(simd16scalar a,simd16scalar b)1116 INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
1117 {
1118     return _mm512_cmplt_ps_mask(a, b);
1119 }
1120 
1121 // convert bitmask to vector mask
vMask16(int32_t mask)1122 INLINE simd16scalar vMask16(int32_t mask)
1123 {
1124     simd16scalari temp = _simd16_set1_epi32(mask);
1125 
1126     simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
1127 
1128     simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
1129 
1130     return _simd16_castsi_ps(result);
1131 }
1132 
1133 #endif//ENABLE_AVX512_EMULATION
1134 
1135 #endif//ENABLE_AVX512_SIMD16
1136 
#endif//__SWR_SIMD16INTRIN_H__
1138