• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/****************************************************************************
2* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22****************************************************************************/
23#if !defined(__SIMD_LIB_AVX_HPP__)
24#error Do not include this file directly, use "simdlib.hpp" instead.
25#endif
26
27using SIMD128T = SIMD128Impl::AVXImpl;
28
29//============================================================================
30// SIMD256 AVX (1) implementation
31//============================================================================
32
// Wrapper macros: each generates a static member function that forwards
// directly to the matching _mm256_* intrinsic.
//   SIMD_WRAPPER_*  - Float operands
//   SIMD_DWRAPPER_* - Double operands
//   SIMD_IWRAPPER_* - Integer operands
//   *_2I            - op takes a compile-time immediate (template parameter)
//   SIMD_IFWRAPPER_* - integer op emulated through a float intrinsic by
//                      bit-casting the operands (AVX1 lacks 256-bit int ALU)

// Unary float op.
#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float const &a) \
    { \
        return _mm256_##op(a); \
    }

// Binary float op.
#define SIMD_WRAPPER_2(op) \
    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
    { \
        return _mm256_##op(a, b); \
    }

// Binary double op.
#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b) \
    { \
        return _mm256_##op(a, b); \
    }

// Binary float op with an immediate operand.
#define SIMD_WRAPPER_2I(op) \
    template<int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
    { \
        return _mm256_##op(a, b, ImmT); \
    }

// Binary double op with an immediate operand.
#define SIMD_DWRAPPER_2I(op) \
    template<int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b) \
    { \
        return _mm256_##op(a, b, ImmT); \
    }

// Ternary float op.
#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \
    { \
        return _mm256_##op(a, b, c); \
    }

// Unary integer op.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
    { \
        return _mm256_##op(a); \
    }

// Binary integer op.
#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        return _mm256_##op(a, b); \
    }

// Binary integer op routed through a float intrinsic via bit casts.
#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) ); \
    }

// Binary integer op with immediate, routed through a float intrinsic.
#define SIMD_IFWRAPPER_2I(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) ); \
    }

// Binary integer op with immediate; intrinsic name differs from op name.
#define SIMD_IWRAPPER_2I_(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        return _mm256_##intrin(a, b, ImmT); \
    }
#define SIMD_IWRAPPER_2I(op)  SIMD_IWRAPPER_2I_(op, op)

// Ternary integer op.
#define SIMD_IWRAPPER_3(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \
    { \
        return _mm256_##op(a, b, c); \
    }
109
110// emulated integer simd
// Emulated 256-bit integer ops: AVX1 provides no 256-bit integer ALU, so
// each op runs SIMD128T's 128-bit implementation on the two halves held in
// the Integer union's v4[] members.

// Unary emulated integer op.
#define SIMD_EMU_IWRAPPER_1(op) \
    static SIMDINLINE \
    Integer SIMDCALL op(Integer const &a) \
    { \
        return Integer \
        { \
            SIMD128T::op(a.v4[0]), \
            SIMD128T::op(a.v4[1]), \
        }; \
    }

// Unary widening op (cvtepu*): both output halves are produced from the low
// 128 bits of the source; the upper input half is exposed by byte-shifting
// the low half right by `shift`. A second overload accepts a 128-bit source.
#define SIMD_EMU_IWRAPPER_1L(op, shift) \
    static SIMDINLINE \
    Integer SIMDCALL op(Integer const &a) \
    { \
        return Integer \
        { \
            SIMD128T::op(a.v4[0]), \
            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
        }; \
    } \
    static SIMDINLINE \
    Integer SIMDCALL op(SIMD128Impl::Integer const &a) \
    { \
        return Integer \
        { \
            SIMD128T::op(a), \
            SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
        }; \
    }

// Unary emulated integer op with a compile-time immediate.
#define SIMD_EMU_IWRAPPER_1I(op) \
    template <int ImmT> static SIMDINLINE \
    Integer SIMDCALL op(Integer const &a) \
    { \
        return Integer \
        { \
            SIMD128T::template op<ImmT>(a.v4[0]), \
            SIMD128T::template op<ImmT>(a.v4[1]), \
        }; \
    }

// Binary emulated integer op.
#define SIMD_EMU_IWRAPPER_2(op) \
    static SIMDINLINE \
    Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        return Integer \
        { \
            SIMD128T::op(a.v4[0], b.v4[0]), \
            SIMD128T::op(a.v4[1], b.v4[1]), \
        }; \
    }
162
// Binary emulated integer op with a compile-time immediate.
// Fix: index b's 128-bit halves via .v4[] as every sibling wrapper does
// (see SIMD_EMU_IWRAPPER_2). The previous b.v[0]/b.v[1] indexed the raw
// 256-bit member (the type load_si stores through as a whole), handing the
// 128-bit SIMD128T op the wrong operand type.
#define SIMD_EMU_IWRAPPER_2I(op) \
    template <int ImmT> static SIMDINLINE \
    Integer SIMDCALL op(Integer const &a, Integer const &b)\
    {\
        return Integer\
        {\
            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]),\
            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]),\
        };\
    }
173
174//-----------------------------------------------------------------------
175// Single precision floating point arithmetic operations
176//-----------------------------------------------------------------------
177SIMD_WRAPPER_2(add_ps);     // return a + b
178SIMD_WRAPPER_2(div_ps);     // return a / b
179
180static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c) // return (a * b) + c
181{
182    return add_ps(mul_ps(a, b), c);
183}
184
185static SIMDINLINE Float SIMDCALL fmsub_ps(Float const &a, Float const &b, Float const &c) // return (a * b) - c
186{
187    return sub_ps(mul_ps(a, b), c);
188}
189
190SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
191SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
192SIMD_WRAPPER_2(mul_ps);     // return a * b
193SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
194SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
195SIMD_WRAPPER_2(sub_ps);     // return a - b
196
197template <RoundMode RMT>
198static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
199{
200    return _mm256_round_ps(a, static_cast<int>(RMT));
201}
202
203static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
204static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
205
206//-----------------------------------------------------------------------
207// Integer (various width) arithmetic operations
208//-----------------------------------------------------------------------
209SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
210SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
211SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
212SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
213SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
214SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
215SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
216SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
217SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
218
219// return (a * b) & 0xFFFFFFFF
220//
221// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
222// and store the low 32 bits of the intermediate integers in dst.
223SIMD_EMU_IWRAPPER_2(mullo_epi32);
224SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
225SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
226SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
227
228//-----------------------------------------------------------------------
229// Logical operations
230//-----------------------------------------------------------------------
231SIMD_WRAPPER_2(and_ps);         // return a & b       (float treated as int)
232SIMD_EMU_IWRAPPER_2(and_si);    // return a & b       (int)
233SIMD_WRAPPER_2(andnot_ps);      // return (~a) & b    (float treated as int)
234SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
235SIMD_WRAPPER_2(or_ps);          // return a | b       (float treated as int)
236SIMD_EMU_IWRAPPER_2(or_si);     // return a | b       (int)
237SIMD_WRAPPER_2(xor_ps);         // return a ^ b       (float treated as int)
238SIMD_EMU_IWRAPPER_2(xor_si);    // return a ^ b       (int)
239
240
241//-----------------------------------------------------------------------
242// Shift operations
243//-----------------------------------------------------------------------
244SIMD_EMU_IWRAPPER_1I(slli_epi32);               // return a << ImmT
245
246static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const &vA, Integer const &vCount) // return a << b      (uint32)
247{
248    int32_t aHi, aLow, countHi, countLow;
249    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
250    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
251    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
252    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
253
254    aHi = _mm_extract_epi32(vAHi, 0);
255    countHi = _mm_extract_epi32(vCountHi, 0);
256    aHi <<= countHi;
257    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
258
259    aLow = _mm_extract_epi32(vALow, 0);
260    countLow = _mm_extract_epi32(vCountLow, 0);
261    aLow <<= countLow;
262    vALow = _mm_insert_epi32(vALow, aLow, 0);
263
264    aHi = _mm_extract_epi32(vAHi, 1);
265    countHi = _mm_extract_epi32(vCountHi, 1);
266    aHi <<= countHi;
267    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
268
269    aLow = _mm_extract_epi32(vALow, 1);
270    countLow = _mm_extract_epi32(vCountLow, 1);
271    aLow <<= countLow;
272    vALow = _mm_insert_epi32(vALow, aLow, 1);
273
274    aHi = _mm_extract_epi32(vAHi, 2);
275    countHi = _mm_extract_epi32(vCountHi, 2);
276    aHi <<= countHi;
277    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
278
279    aLow = _mm_extract_epi32(vALow, 2);
280    countLow = _mm_extract_epi32(vCountLow, 2);
281    aLow <<= countLow;
282    vALow = _mm_insert_epi32(vALow, aLow, 2);
283
284    aHi = _mm_extract_epi32(vAHi, 3);
285    countHi = _mm_extract_epi32(vCountHi, 3);
286    aHi <<= countHi;
287    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
288
289    aLow = _mm_extract_epi32(vALow, 3);
290    countLow = _mm_extract_epi32(vCountLow, 3);
291    aLow <<= countLow;
292    vALow = _mm_insert_epi32(vALow, aLow, 3);
293
294    __m256i ret = _mm256_set1_epi32(0);
295    ret = _mm256_insertf128_si256(ret, vAHi, 1);
296    ret = _mm256_insertf128_si256(ret, vALow, 0);
297    return ret;
298}
299
300SIMD_EMU_IWRAPPER_1I(srai_epi32);   // return a >> ImmT   (int32)
301SIMD_EMU_IWRAPPER_1I(srli_epi32);   // return a >> ImmT   (uint32)
302SIMD_EMU_IWRAPPER_1I(srli_si);      // return a >> (ImmT*8) (uint)
303
304template<int ImmT>                              // same as srli_si, but with Float cast to int
305static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)
306{
307    return castsi_ps(srli_si<ImmT>(castps_si(a)));
308}
309
310static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const &vA, Integer const &vCount) // return a >> b      (uint32)
311{
312    int32_t aHi, aLow, countHi, countLow;
313    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
314    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
315    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
316    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
317
318    aHi = _mm_extract_epi32(vAHi, 0);
319    countHi = _mm_extract_epi32(vCountHi, 0);
320    aHi >>= countHi;
321    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
322
323    aLow = _mm_extract_epi32(vALow, 0);
324    countLow = _mm_extract_epi32(vCountLow, 0);
325    aLow >>= countLow;
326    vALow = _mm_insert_epi32(vALow, aLow, 0);
327
328    aHi = _mm_extract_epi32(vAHi, 1);
329    countHi = _mm_extract_epi32(vCountHi, 1);
330    aHi >>= countHi;
331    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
332
333    aLow = _mm_extract_epi32(vALow, 1);
334    countLow = _mm_extract_epi32(vCountLow, 1);
335    aLow >>= countLow;
336    vALow = _mm_insert_epi32(vALow, aLow, 1);
337
338    aHi = _mm_extract_epi32(vAHi, 2);
339    countHi = _mm_extract_epi32(vCountHi, 2);
340    aHi >>= countHi;
341    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
342
343    aLow = _mm_extract_epi32(vALow, 2);
344    countLow = _mm_extract_epi32(vCountLow, 2);
345    aLow >>= countLow;
346    vALow = _mm_insert_epi32(vALow, aLow, 2);
347
348    aHi = _mm_extract_epi32(vAHi, 3);
349    countHi = _mm_extract_epi32(vCountHi, 3);
350    aHi >>= countHi;
351    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
352
353    aLow = _mm_extract_epi32(vALow, 3);
354    countLow = _mm_extract_epi32(vCountLow, 3);
355    aLow >>= countLow;
356    vALow = _mm_insert_epi32(vALow, aLow, 3);
357
358    __m256i ret = _mm256_set1_epi32(0);
359    ret = _mm256_insertf128_si256(ret, vAHi, 1);
360    ret = _mm256_insertf128_si256(ret, vALow, 0);
361    return ret;
362}
363
364
365
366//-----------------------------------------------------------------------
367// Conversion operations
368//-----------------------------------------------------------------------
369static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a)   // return *(Float*)(&a)
370{
371    return _mm256_castpd_ps(a);
372}
373
374static SIMDINLINE Integer SIMDCALL castps_si(Float const &a)   // return *(Integer*)(&a)
375{
376    return _mm256_castps_si256(a);
377}
378
379static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a)   // return *(Double*)(&a)
380{
381    return _mm256_castsi256_pd(a);
382}
383
384static SIMDINLINE Double SIMDCALL castps_pd(Float const &a)   // return *(Double*)(&a)
385{
386    return _mm256_castps_pd(a);
387}
388
389static SIMDINLINE Integer SIMDCALL castpd_si(Double const &a)   // return *(Integer*)(&a)
390{
391    return _mm256_castpd_si256(a);
392}
393
394static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a)   // return *(Float*)(&a)
395{
396    return _mm256_castsi256_ps(a);
397}
398
399static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a    (int32 --> float)
400{
401    return _mm256_cvtepi32_ps(a);
402}
403
404SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);                  // return (int16)a    (uint8 --> int16)
405SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);                  // return (int32)a    (uint8 --> int32)
406SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8);                 // return (int32)a    (uint16 --> int32)
407SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4);                 // return (int64)a    (uint16 --> int64)
408SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8);                 // return (int64)a    (uint32 --> int64)
409
410static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a)            // return (int32)a    (float --> int32)
411{
412    return _mm256_cvtps_epi32(a);
413}
414
415static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a)           // return (int32)a    (rnd_to_zero(float) --> int32)
416{
417    return _mm256_cvttps_epi32(a);
418}
419
420//-----------------------------------------------------------------------
421// Comparison operations
422//-----------------------------------------------------------------------
423template<CompareType CmpTypeT>
424static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
425{
426    return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
427}
428static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
429static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
430static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
431static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
432static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
433static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
434
435SIMD_EMU_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
436SIMD_EMU_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
437SIMD_EMU_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
438SIMD_EMU_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
439SIMD_EMU_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
440SIMD_EMU_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
441SIMD_EMU_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
442SIMD_EMU_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
443SIMD_EMU_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
444
445static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
446{
447    return  0 != _mm256_testz_ps(a, b);
448}
449
450static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
451{
452    return  0 != _mm256_testz_si256(a, b);
453}
454
455//-----------------------------------------------------------------------
456// Blend / shuffle / permute operations
457//-----------------------------------------------------------------------
458SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
459SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps);  // return ImmT ? b : a  (int32)
460SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
461
462static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
463{
464    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
465}
466
467static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
468{
469    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
470}
471
472static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
473{
474    return _mm256_broadcast_ss(p);
475}
476
477SIMD_EMU_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
478SIMD_EMU_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
479SIMD_EMU_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
480SIMD_EMU_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
481
482static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
483{
484    Integer result;
485
486    // Ugly slow implementation
487    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
488    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
489    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
490
491    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
492    {
493        pResult[i] = pA[0xF & pSwiz[i]];
494    }
495
496    return result;
497}
498
499static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
500{
501    Float result;
502
503    // Ugly slow implementation
504    float const *pA = reinterpret_cast<float const*>(&a);
505    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
506    float *pResult = reinterpret_cast<float *>(&result);
507
508    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
509    {
510        pResult[i] = pA[0xF & pSwiz[i]];
511    }
512
513    return result;
514}
515
516SIMD_WRAPPER_2I(permute2f128_ps);
517SIMD_DWRAPPER_2I(permute2f128_pd);
518SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
519
520
521SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
522
523template<int ImmT>
524static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b)
525{
526    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
527}
528SIMD_EMU_IWRAPPER_2(shuffle_epi8);
529SIMD_DWRAPPER_2I(shuffle_pd);
530SIMD_WRAPPER_2I(shuffle_ps);
531SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
532SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
533SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
534SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
535SIMD_DWRAPPER_2(unpackhi_pd);
536SIMD_WRAPPER_2(unpackhi_ps);
537SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
538SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
539SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
540SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
541SIMD_DWRAPPER_2(unpacklo_pd);
542SIMD_WRAPPER_2(unpacklo_ps);
543
544//-----------------------------------------------------------------------
545// Load / store operations
546//-----------------------------------------------------------------------
547template<ScaleFactor ScaleT>
548static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
549{
550    uint32_t *pOffsets = (uint32_t*)&idx;
551    Float vResult;
552    float* pResult = (float*)&vResult;
553    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
554    {
555        uint32_t offset = pOffsets[i];
556        offset = offset * static_cast<uint32_t>(ScaleT);
557        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
558    }
559
560    return vResult;
561}
562
563static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
564{
565    return broadcast_ss(p);
566}
567
568static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
569{
570    return _mm256_load_ps(p);
571}
572
573static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
574{
575    return _mm256_load_si256(&p->v);
576}
577
578static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
579{
580    return _mm256_loadu_ps(p);
581}
582
583static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
584{
585    return _mm256_lddqu_si256(&p->v);
586}
587
588// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
589template<ScaleFactor ScaleT>
590static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
591{
592    uint32_t *pOffsets = (uint32_t*)&idx;
593    Float vResult = old;
594    float* pResult = (float*)&vResult;
595    DWORD index;
596    uint32_t umask = movemask_ps(mask);
597    while (_BitScanForward(&index, umask))
598    {
599        umask &= ~(1 << index);
600        uint32_t offset = pOffsets[index];
601        offset = offset * static_cast<uint32_t>(ScaleT);
602        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
603    }
604
605    return vResult;
606}
607
608static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
609{
610    _mm256_maskstore_ps(p, mask, src);
611}
612
613static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a)
614{
615    return SIMD128T::movemask_epi8(a.v4[0]) |
616           (SIMD128T::movemask_epi8(a.v4[1]) << 16);
617}
618
619static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
620{
621    return static_cast<uint32_t>(_mm256_movemask_pd(a));
622}
623static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
624{
625    return static_cast<uint32_t>(_mm256_movemask_ps(a));
626}
627
628static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
629{
630    return _mm256_set1_epi32(i);
631}
632
633static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
634{
635    return _mm256_set1_epi8(i);
636}
637
638static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
639{
640    return _mm256_set1_ps(f);
641}
642
643static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
644{
645    return _mm256_setzero_ps();
646}
647
648static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
649{
650    return _mm256_setzero_si256();
651}
652
653static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a)    // *p = a   (stores all elements contiguously in memory)
654{
655    _mm256_store_ps(p, a);
656}
657
658static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a)   // *p = a
659{
660    _mm256_store_si256(&p->v, a);
661}
662
663static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
664{
665    _mm256_stream_ps(p, a);
666}
667
668//=======================================================================
669// Legacy interface (available only in SIMD256 width)
670//=======================================================================
671
672static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
673{
674    return _mm256_broadcast_ps(&p->v);
675}
676
677template<int ImmT>
678static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const &a)
679{
680    return _mm256_extractf128_pd(a, ImmT);
681}
682
683template<int ImmT>
684static SIMDINLINE SIMD128Impl::Float  SIMDCALL extractf128_ps(Float const &a)
685{
686    return _mm256_extractf128_ps(a, ImmT);
687}
688
689template<int ImmT>
690static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const &a)
691{
692    return _mm256_extractf128_si256(a, ImmT);
693}
694
695template<int ImmT>
696static SIMDINLINE Double SIMDCALL insertf128_pd(Double const &a, SIMD128Impl::Double const &b)
697{
698    return _mm256_insertf128_pd(a, b, ImmT);
699}
700
701template<int ImmT>
702static SIMDINLINE Float SIMDCALL insertf128_ps(Float const &a, SIMD128Impl::Float const &b)
703{
704    return _mm256_insertf128_ps(a, b, ImmT);
705}
706
707template<int ImmT>
708static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const &a, SIMD128Impl::Integer const &b)
709{
710    return _mm256_insertf128_si256(a, b, ImmT);
711}
712
// Fallback shims for toolchains whose <immintrin.h> does not provide these
// composite helpers (they are macros in some headers, hence #ifndef).
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif

#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
                            /* SIMD128Impl::Integer const* */ loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif
723
724static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo)
725{
726    return _mm256_loadu2_m128i(&phi->v, &plo->v);
727}
728
729static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
730{
731    return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
732}
733
734static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
735{
736    return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
737}
738
739static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer const &src)
740{
741    _mm256_storeu2_m128i(&phi->v, &plo->v, src);
742}
743
744static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
745{
746    Integer vec = set1_epi32(mask);
747    const Integer bit = set_epi32(
748        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
749    vec = and_si(vec, bit);
750    vec = cmplt_epi32(setzero_si(), vec);
751    return castsi_ps(vec);
752}
753
// Scrub the helper macros so they cannot leak into the other SIMD-width
// implementation headers included after this one.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_3
#undef SIMD_IFWRAPPER_2
#undef SIMD_IFWRAPPER_2I
#undef SIMD_EMU_IWRAPPER_1
#undef SIMD_EMU_IWRAPPER_1I
#undef SIMD_EMU_IWRAPPER_2
#undef SIMD_EMU_IWRAPPER_2I
772