• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/****************************************************************************
2 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23#if !defined(__SIMD_LIB_AVX_HPP__)
24#error Do not include this file directly, use "simdlib.hpp" instead.
25#endif
26
27//============================================================================
28// SIMD128 AVX (1) implementation
29//============================================================================
30
//------------------------------------------------------------------------
// Wrapper-generator macros. Each stamps out a static function named `op`
// that forwards its arguments to the corresponding _mm_* intrinsic.
//   SIMD_WRAPPER_n(op)  - Float op taking n Float operands
//   SIMD_DWRAPPER_*     - Double variants
//   SIMD_IWRAPPER_*     - Integer variants
//   *_1I / *_2I         - ops whose intrinsic takes an immediate constant;
//                         the immediate is passed as a template parameter
//   *_(op, intrin)      - explicit intrinsic name for cases where the
//                         intrinsic is not literally _mm_##op
//   SIMD_IFWRAPPER_2    - integer op emulated with a float intrinsic via
//                         bitwise casts
//------------------------------------------------------------------------
#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); }

#define SIMD_WRAPPER_2(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); }

#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); }

#define SIMD_WRAPPER_2I(op)                               \
    template <int ImmT>                                   \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {                                                     \
        return _mm_##op(a, b, ImmT);                      \
    }

#define SIMD_DWRAPPER_2I(op)                                 \
    template <int ImmT>                                      \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {                                                        \
        return _mm_##op(a, b, ImmT);                         \
    }

#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); }

#define SIMD_IWRAPPER_1I_(op, intrin)                \
    template <int ImmT>                              \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {                                                \
        return intrin(a, ImmT);                      \
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)

#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); }

#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b); }

#define SIMD_IFWRAPPER_2(op, intrin)                            \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));   \
    }

#define SIMD_IWRAPPER_2I(op)                                    \
    template <int ImmT>                                         \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return _mm_##op(a, b, ImmT);                            \
    }
86
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);   // return a + b
SIMD_WRAPPER_2(div_ps);   // return a / b
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a (fast approximation, not full precision)
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) (fast approximation, not full precision)
SIMD_WRAPPER_2(sub_ps);   // return a - b
98
99static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
100{
101    return add_ps(mul_ps(a, b), c);
102}
103static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
104{
105    return sub_ps(mul_ps(a, b), c);
106}
107
// Round each lane of a; the rounding mode is a compile-time template
// parameter (see the RoundMode enum declared elsewhere in simdlib).
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm_round_ps(a, static_cast<int>(RMT));
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) // return ceil(a) per lane
{
    return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float a) // return floor(a) per lane
{
    return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
122
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // multiplies the low int32 of lanes 0 and 2, producing two int64 products

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
144
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// The *_si variants use the explicit-intrinsic wrapper because the
// intrinsic names (_mm_*_si128) do not match the op names (*_si).
SIMD_WRAPPER_2(and_ps);                        // return a & b       (float treated as int)
SIMD_IWRAPPER_2_(and_si, _mm_and_si128);       // return a & b       (int)
SIMD_WRAPPER_2(andnot_ps);                     // return (~a) & b    (float treated as int)
SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b    (int)
SIMD_WRAPPER_2(or_ps);                         // return a | b       (float treated as int)
SIMD_IWRAPPER_2_(or_si, _mm_or_si128);         // return a | b       (int)
SIMD_WRAPPER_2(xor_ps);                        // return a ^ b       (float treated as int)
SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);       // return a ^ b       (int)
156
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT (int32 lanes)
SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT (int64 lanes)
162
163static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
164{
165    int32_t a, count;
166    a     = _mm_extract_epi32(vA, 0);
167    count = _mm_extract_epi32(vB, 0);
168    a <<= count;
169    vA = _mm_insert_epi32(vA, a, 0);
170
171    a     = _mm_extract_epi32(vA, 1);
172    count = _mm_extract_epi32(vB, 1);
173    a <<= count;
174    vA = _mm_insert_epi32(vA, a, 1);
175
176    a     = _mm_extract_epi32(vA, 2);
177    count = _mm_extract_epi32(vB, 2);
178    a <<= count;
179    vA = _mm_insert_epi32(vA, a, 2);
180
181    a     = _mm_extract_epi32(vA, 3);
182    count = _mm_extract_epi32(vB, 3);
183    a <<= count;
184    vA = _mm_insert_epi32(vA, a, 3);
185
186    return vA;
187}
188
SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32, arithmetic)
SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32, logical)
SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint) - whole-register byte shift

// return a >> n (uint64 lanes); shift count taken from the low 64 bits of n
static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n)
{
    return _mm_srl_epi64(a, n);
}

template <int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
203
204static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
205{
206    int32_t a, count;
207    a     = _mm_extract_epi32(vA, 0);
208    count = _mm_extract_epi32(vB, 0);
209    a >>= count;
210    vA = _mm_insert_epi32(vA, a, 0);
211
212    a     = _mm_extract_epi32(vA, 1);
213    count = _mm_extract_epi32(vB, 1);
214    a >>= count;
215    vA = _mm_insert_epi32(vA, a, 1);
216
217    a     = _mm_extract_epi32(vA, 2);
218    count = _mm_extract_epi32(vB, 2);
219    a >>= count;
220    vA = _mm_insert_epi32(vA, a, 2);
221
222    a     = _mm_extract_epi32(vA, 3);
223    count = _mm_extract_epi32(vB, 3);
224    a >>= count;
225    vA = _mm_insert_epi32(vA, a, 3);
226
227    return vA;
228}
229
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
// cast* functions reinterpret bits without converting values.
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
    return _mm_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
    return _mm_castps_si128(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
    return _mm_castsi128_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
    return _mm_castps_pd(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
    return _mm_castsi128_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
{
    return _mm_cvtepi32_ps(a);
}

static SIMDINLINE int32_t SIMDCALL cvtsi128_si32(Integer a) // return a.v[0]
{
    return _mm_cvtsi128_si32(a);
}

static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n) // return a[0] = n, a[1]...a[3] = 0
{
    return _mm_cvtsi32_si128(n);
}

// Zero-extending widening conversions ("epu" source lanes are unsigned)
SIMD_IWRAPPER_1(cvtepu8_epi16);  // return (int16)a    (uint8 --> int16)
SIMD_IWRAPPER_1(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)

// Rounds according to the current MXCSR rounding mode
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a    (float --> int32)
{
    return _mm_cvtps_epi32(a);
}

// Truncates (rounds toward zero) regardless of the MXCSR rounding mode
static SIMDINLINE Integer SIMDCALL
                          cvttps_epi32(Float a) // return (int32)a    (rnd_to_zero(float) --> int32)
{
    return _mm_cvttps_epi32(a);
}
289
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
// Generic float comparison; the predicate is a compile-time CompareType.
// Each lane yields all-ones (true) or all-zeros (false).
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
// Convenience predicates. The *_OQ variants are "ordered, quiet":
// a comparison involving NaN yields false and does not signal.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
{
    return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
{
    return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
{
    return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
{
    return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
{
    return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
{
    return cmp_ps<CompareType::LE_OQ>(a, b);
}

// Integer comparisons; each lane yields all-ones or all-zeros.
SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)

static SIMDINLINE bool SIMDCALL testz_ps(Float a,
                                         Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return 0 != _mm_testz_ps(a, b);
}

static SIMDINLINE bool SIMDCALL testz_si(Integer a,
                                         Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return 0 != _mm_testz_si128(a, b);
}
344
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a  (float) - per-lane select by immediate bits
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a  (float) - selects on each mask lane's sign bit

// Integer blend driven by a float mask; reuses blendv_ps through bitwise
// casts (selection rule is the mask lane's sign bit, as above).
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
                                                Integer b,
                                                Float   mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}

// Integer blend driven by an integer mask (same selection rule).
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
                                                Integer b,
                                                Integer mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}

static SIMDINLINE Float SIMDCALL
                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
    return _mm_broadcast_ss(p);
}
370
// Saturating lane-narrowing packs
SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32

// Integer permute emulated with the float permute plus bitwise casts
static SIMDINLINE Integer SIMDCALL
                          permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int)
{
    return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
}

static SIMDINLINE Float SIMDCALL
                        permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm_permutevar_ps(a, swiz);
}

SIMD_IWRAPPER_1(shuffle_epi32);

// Explicitly deleted for this target; any use fails at compile time.
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;

SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);

// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
414
415//-----------------------------------------------------------------------
416// Load / store operations
417//-----------------------------------------------------------------------
418template <ScaleFactor ScaleT = ScaleFactor::SF_1>
419static SIMDINLINE Float SIMDCALL
420                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
421{
422    uint32_t* pOffsets = (uint32_t*)&idx;
423    Float     vResult;
424    float*    pResult = (float*)&vResult;
425    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
426    {
427        uint32_t offset = pOffsets[i];
428        offset          = offset * static_cast<uint32_t>(ScaleT);
429        pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
430    }
431
432    return vResult;
433}
434
static SIMDINLINE Float SIMDCALL
                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

// NOTE: aligned load - p must be 16-byte aligned
static SIMDINLINE Float SIMDCALL
                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
{
    return _mm_load_ps(p);
}

// NOTE: aligned load - p must be 16-byte aligned
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
    return _mm_load_si128(&p->v);
}

static SIMDINLINE Float SIMDCALL
                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
{
    return _mm_loadu_ps(p);
}

// Uses lddqu, which can outperform loadu for loads crossing a cache line
static SIMDINLINE Integer SIMDCALL
                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
{
    return _mm_lddqu_si128(&p->v);
}
463
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    uint32_t* pOffsets = (uint32_t*)&idx;
    Float     vResult  = old;            // unselected lanes keep their old value
    float*    pResult  = (float*)&vResult;
    unsigned long index;
    uint32_t  umask = movemask_ps(mask); // one bit per lane, taken from each lane's sign bit
    // Visit only the set mask bits, lowest first; _BitScanForward returns
    // false once umask is empty. (NOTE(review): _BitScanForward is an MSVC
    // intrinsic - presumably shimmed for other compilers elsewhere in the
    // project; confirm.)
    while (_BitScanForward(&index, umask))
    {
        umask &= ~(1 << index);          // clear the lane bit just handled
        uint32_t offset = pOffsets[index];
        offset          = offset * static_cast<uint32_t>(ScaleT); // byte-scaled offset
        pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
    }

    return vResult;
}
484
// Store only the lanes whose corresponding mask lane has its high (sign)
// bit set; other memory locations are left untouched.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
    _mm_maskstore_ps(p, mask, src);
}

// Collect the most significant bit of each 8-bit lane into a 16-bit mask
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    return static_cast<uint32_t>(_mm_movemask_epi8(a));
}

// Collect the sign bit of each double lane into a 2-bit mask
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    return static_cast<uint32_t>(_mm_movemask_pd(a));
}
// Collect the sign bit of each float lane into a 4-bit mask
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    return static_cast<uint32_t>(_mm_movemask_ps(a));
}
503
// Broadcast / zero vector constructors
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
    return _mm_set1_ps(f);
}

static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
    return _mm_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
    return _mm_setzero_si128();
}
528
// NOTE: aligned store - p must be 16-byte aligned
static SIMDINLINE void SIMDCALL
                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
{
    _mm_store_ps(p, a);
}

// NOTE: aligned store - p must be 16-byte aligned
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
    _mm_store_si128(&p->v, a);
}

static SIMDINLINE void SIMDCALL
                       storeu_si(Integer* p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
{
    _mm_storeu_si128(&p->v, a);
}

// Non-temporal (streaming) store; p must be 16-byte aligned
static SIMDINLINE void SIMDCALL
                       stream_ps(float* p, Float a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
{
    _mm_stream_ps(p, a);
}
551
// Lane constructors: arguments are given most-significant lane first
// (in3 lands in lane 3, in0 in lane 0), matching the _mm_set_* convention.
static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
{
    return _mm_set_ps(in3, in2, in1, in0);
}

static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0)
{
    return _mm_set_epi32(in3, in2, in1, in0);
}
561
562template <int ImmT>
563static SIMDINLINE float SIMDCALL extract_ps(Float a)
564{
565    int tmp = _mm_extract_ps(a, ImmT);
566    return *reinterpret_cast<float*>(&tmp);
567}
568
569static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
570{
571    Integer       vec = set1_epi32(mask);
572    const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01);
573    vec               = and_si(vec, bit);
574    vec               = cmplt_epi32(setzero_si(), vec);
575    return castsi_ps(vec);
576}
577
// Tear down the wrapper-generator macros so sibling ISA headers can
// redefine them. (Duplicate #undefs of SIMD_IWRAPPER_1/_2/_2I removed;
// one per macro suffices.)
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
#undef SIMD_IFWRAPPER_2
594