/****************************************************************************
* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
// gcc as of 7.1 was missing these intrinsics
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
#endif

#ifndef _mm512_cmplt_ps_mask
#define _mm512_cmplt_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_LT_OS)
#endif

#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
#endif

#endif

//============================================================================
// SIMD16 AVX512 (F) implementation
// (compatible with Knights and Core processors)
//
//============================================================================

static const int TARGET_SIMD_WIDTH = 16;
using SIMD256T = SIMD256Impl::AVX2Impl;

#define SIMD_WRAPPER_1_(op, intrin)  \
    static SIMDINLINE Float SIMDCALL op(Float a)   \
    {\
        return intrin(a);\
    }

#define SIMD_WRAPPER_1(op)  \
    SIMD_WRAPPER_1_(op, _mm512_##op)

#define SIMD_WRAPPER_2_(op, intrin)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)

#define SIMD_WRAPPERI_2_(op, intrin)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return _mm512_castsi512_ps(_mm512_##intrin(\
            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
    }

#define SIMD_DWRAPPER_2(op)  \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {\
        return _mm512_##op(a, b);\
    }

#define SIMD_WRAPPER_2I_(op, intrin)  \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)

#define SIMD_DWRAPPER_2I_(op, intrin)  \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)

#define SIMD_WRAPPER_3(op)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
    {\
        return _mm512_##op(a, b, c);\
    }

#define SIMD_IWRAPPER_1(op)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return _mm512_##op(a);\
    }
#define SIMD_IWRAPPER_1_8(op)  \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1_4(op)  \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1I_(op, intrin)  \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)

#define SIMD_IWRAPPER_2_(op, intrin)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)

#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return cmp(a, b);\
    }

#define SIMD_IFWRAPPER_2(op, intrin)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
    }

#define SIMD_IWRAPPER_2I_(op, intrin)  \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)

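// For orientation (illustrative sketch, not generated code): an invocation
// such as SIMD_WRAPPER_2(add_ps) below expands to
//
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return _mm512_add_ps(a, b);
//     }
//
// so each one-line wrapper invocation defines a full 512-bit member function.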
private:
    static SIMDINLINE Integer vmask(__mmask16 m)
    {
        return _mm512_maskz_set1_epi32(m, -1);
    }

    static SIMDINLINE Integer vmask(__mmask8 m)
    {
        return _mm512_maskz_set1_epi64(m, -1LL);
    }
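
    // For example (illustrative): vmask(__mmask16(0x0003)) returns a vector
    // whose two low 32-bit lanes are all-ones and whose remaining lanes are
    // zero, emulating the full-width lane masks produced by AVX/AVX2 compares.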

public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);     // return a + b
SIMD_WRAPPER_2(div_ps);     // return a / b
SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);     // return a * b
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);       // return approx(1.0f / a)       (relative error <= 2^-14)
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps);   // return approx(1.0f / sqrt(a)) (relative error <= 2^-14)
SIMD_WRAPPER_2(sub_ps);     // return a - b

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
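
// Usage sketch (illustrative; assumes the RoundMode enumerators carry the
// matching _MM_FROUND_* immediate encodings, which static_cast forwards
// verbatim to _mm512_roundscale_ps):
//
//     Float x  = set1_ps(1.5f);
//     Float up = ceil_ps(x);     // every lane == 2.0f
//     Float dn = floor_ps(x);    // every lane == 1.0f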

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
//SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b of the even 32-bit lanes (int32 --> int64)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate
// 64-bit integers, and store the low 32 bits of the intermediate integers
// in dst. (See the sketch after this list.)
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
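
// Sketch of the mullo_epi32 / mul_epi32 distinction noted above (illustrative):
//
//     Integer a    = set1_epi32(0x10000);  // 2^16 in all 16 lanes
//     Integer lo   = mullo_epi32(a, a);    // every 32-bit lane == 0 (2^32 truncated)
//     Integer wide = mul_epi32(a, a);      // eight 64-bit lanes, each == 2^32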

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512);        // return a & b       (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512);  // return (~a) & b    (int)
SIMD_IWRAPPER_2_(or_si, or_si512);          // return a | b       (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512);        // return a ^ b       (int)

// SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
// SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
// SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
// SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)

//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT   (int32)
SIMD_IWRAPPER_2(sllv_epi32);                // return a << b      (int32, per-lane shift counts)
SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)

#if 0
SIMD_IWRAPPER_1I_(srli_si, srli_si512);     // return a >> (ImmT*8) (uint)

template<int ImmT>                          // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
#endif

SIMD_IWRAPPER_2(srlv_epi32);                // return a >> b      (uint32, per-lane shift counts)

//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
{
    return _mm512_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
{
    return _mm512_castps_si512(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
{
    return _mm512_castsi512_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
{
    return _mm512_castps_pd(a);
}

static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)
{
    return _mm512_castpd_si512(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
{
    return _mm512_castsi512_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
{
    return _mm512_cvtepi32_ps(a);
}

//SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)

static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
{
    return _mm512_cvtps_epi32(a);
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
{
    return _mm512_cvttps_epi32(a);
}
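
// Illustrative examples: cvtps_epi32 rounds to nearest-even under the default
// MXCSR rounding mode, while cvttps_epi32 always truncates toward zero:
//
//     Integer a = cvtps_epi32(set1_ps(2.5f));    // every lane == 2 (ties to even)
//     Integer b = cvtps_epi32(set1_ps(3.5f));    // every lane == 4
//     Integer c = cvttps_epi32(set1_ps(3.9f));   // every lane == 3 (truncated)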

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
    return _mm512_cmp_ps_mask(a, b, static_cast<int>(CmpTypeT));
}

template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    // Expand the AVX-512 mask register into a legacy full-width vector mask,
    // matching the result format of the AVX/AVX2 implementations.
    __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
    return castsi_ps(vmask(result));
}

static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
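
// Because cmp_ps returns full-width lane masks, the AVX2 compare-then-select
// idiom carries over unchanged (illustrative):
//
//     Float m = cmplt_ps(x, y);        // all-ones lanes where x < y
//     Float r = blendv_ps(y, x, m);    // per-lane min(x, y)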

template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
    // Expand the result mask register into a legacy full-width vector mask
    __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<int>(CmpTypeT));
    return vmask(result);
}

template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
    // Expand the result mask register into a legacy full-width vector mask
    __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<int>(CmpTypeT));
    return vmask(result);
}

//SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);   // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);   // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);   // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);   // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);   // return a < b (int32)

static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}

static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}
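
// Note (illustrative): testz_si(a, a) is true exactly when every bit of a is zero.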

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a  (float)
{
    return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}

template <int ImmT>
static SIMDINLINE Integer blend_epi32(Integer a, Integer b) // return ImmT ? b : a  (int32)
{
    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}

static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a  (float)
{
    return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
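
// Illustrative: the ImmT bit pattern selects per lane, so blend_ps<0x0001>(a, b)
// takes lane 0 from b and lanes 1..15 from a; blendv_ps considers only the
// sign bit of each mask lane (extracted via movemask_ps).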

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
{
    return _mm512_set1_ps(*p);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}

template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
    return _mm512_extractf64x4_pd(a, imm);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
    return _mm512_extracti64x4_epi64(a, imm);
}

template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}

template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
    return _mm512_insertf64x4(a, b, imm);
}

template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
    return _mm512_inserti64x4(a, b, imm);
}
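
// Illustrative: extract_si<1>(a) returns the upper eight 32-bit lanes as a
// 256-bit vector, and insert_si<1>(a, b) writes b back over that half. The
// f64x4 casts are used because AVX512F provides 256-bit extract/insert only
// in the 64x4 flavor (the f32x8 forms require AVX512DQ).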

// SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (int32)
{
    return _mm512_permutexvar_epi32(swiz, a);
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm512_permutexvar_ps(swiz, a);
}

SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
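
// Illustrative full lane reversal with permute_ps:
//
//     Integer rev = set_epi32(0, 1, 2,  3,  4,  5,  6,  7,
//                             8, 9, 10, 11, 12, 13, 14, 15);
//     Float flipped = permute_ps(x, rev);    // flipped[i] == x[15 - i]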

SIMD_IWRAPPER_1I(shuffle_epi32);

//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);

template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi16);

//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi64);
//SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
//SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
//SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
}

static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
{
    return _mm512_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
{
    return _mm512_load_si512(&p->v);
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
{
    return _mm512_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
{
    return _mm512_loadu_si512(p);
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());

    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}
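
// Illustrative gather of 16 consecutive floats (the enumerator name
// ScaleFactor::SF_4 is assumed here to denote a 4-byte scale, i.e. sizeof(float)):
//
//     Integer idx = set_epi32(15, 14, 13, 12, 11, 10, 9, 8,
//                             7,  6,  5,  4,  3,  2,  1, 0);
//     Float v = i32gather_ps<ScaleFactor::SF_4>(table, idx);    // v[i] == table[i]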

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    // Store only lanes whose mask sign bit is set (mask lane < 0)
    Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
    _mm512_mask_store_ps(p, m, src);
}

//static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
//{
//    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
//    return static_cast<uint64_t>(m);
//}

static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    __mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL));
    return static_cast<uint32_t>(m);
}

static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    __mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
    return static_cast<uint32_t>(m);
}
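
// Illustrative: movemask_ps packs the sign bit of each float lane into one
// result bit, mirroring AVX's vmovmskps:
//
//     uint32_t bits = movemask_ps(cmplt_ps(x, setzero_ps()));  // bit i set <=> x[i] < 0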

static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all elements are same value)
{
    return _mm512_set1_epi64(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm512_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm512_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
{
    return _mm512_set1_ps(f);
}

static SIMDINLINE Double SIMDCALL setzero_pd()      // return 0 (double)
{
    return _mm512_setzero_pd();
}

static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
{
    return _mm512_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
{
    return _mm512_setzero_si512();
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
{
    _mm512_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
{
    _mm512_store_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
{
    _mm512_storeu_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
{
    _mm512_stream_ps(p, a);
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return _mm512_set_epi32(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return set_epi32(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return _mm512_set_ps(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return set_ps(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}
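
// Illustrative: vmask_ps(0x5) yields all-ones in float lanes 0 and 2 and zeros
// elsewhere, converting a scalar bitmask into a blendv_ps-style vector mask.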

#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
#undef SIMD_IFWRAPPER_2