/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a+b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a+b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a-b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a-b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

#define _mm256_round_pd(V, M) __extension__ ({ \
    __m256d __V = (V); \
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
  __m256 __V = (V); \
  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

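/* Usage sketch: the ceil/floor wrappers are just _mm256_round_pd and
 * _mm256_round_ps with a fixed _MM_FROUND_* rounding mode (defined in
 * <smmintrin.h>), e.g.
 *
 *   __m256d v    = _mm256_set1_pd(2.5);
 *   __m256d up   = _mm256_ceil_pd(v);   // every element becomes 3.0
 *   __m256d down = _mm256_floor_pd(v);  // every element becomes 2.0
 */
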
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

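/* Note: the horizontal add/sub intrinsics work within each 128-bit lane.
 * For a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3},
 *
 *   _mm256_hadd_pd(a, b) == { a0+a1, b0+b1, a2+a3, b2+b3 }
 *
 * so fully reducing one __m256d still needs a cross-lane step such as
 * _mm256_extractf128_pd followed by a 128-bit add.
 */
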
/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
                                                (__v8si)c);
}

#define _mm_permute_pd(A, C) __extension__ ({ \
  __m128d __A = (A); \
  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1); })

#define _mm256_permute_pd(A, C) __extension__ ({ \
  __m256d __A = (A); \
  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1, \
                                   2 + (((C) & 0x4) >> 2), \
                                   2 + (((C) & 0x8) >> 3)); })

#define _mm_permute_ps(A, C) __extension__ ({ \
  __m128 __A = (A); \
  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
                                   (C) & 0x3, ((C) & 0xc) >> 2, \
                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })

#define _mm256_permute_ps(A, C) __extension__ ({ \
  __m256 __A = (A); \
  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                  4 + (((C) & 0x03) >> 0), \
                                  4 + (((C) & 0x0c) >> 2), \
                                  4 + (((C) & 0x30) >> 4), \
                                  4 + (((C) & 0xc0) >> 6)); })

#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m256i __V2 = (V2); \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })

/* Vector Blend */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

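/* Usage sketch: a branchless per-element select. _mm256_blendv_pd picks from
 * b where the sign bit of the corresponding mask element is set, otherwise
 * from a:
 *
 *   __m256d mask = _mm256_cmp_pd(x, y, _CMP_LT_OS);  // all-ones where x < y
 *   __m256d r    = _mm256_blendv_pd(a, b, mask);     // b where x < y, else a
 *
 * The immediate forms (_mm256_blend_pd/_mm256_blend_ps) instead select with a
 * compile-time bit mask, one bit per element.
 */
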
/* Vector Dot Product */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
        __m256 __a = (a); \
        __m256 __b = (b); \
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
        __m256d __a = (a); \
        __m256d __b = (b); \
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })

/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })

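/* Usage sketch: the cmp macros return a per-element all-ones/all-zeros mask;
 * _mm256_movemask_ps (defined later in this header) packs the sign bits into
 * an ordinary int for scalar tests:
 *
 *   __m256 m = _mm256_cmp_ps(a, b, _CMP_GT_OS);
 *   int bits = _mm256_movemask_ps(m);   // bit i set iff a[i] > b[i]
 *   if (bits == 0xff) { ... all eight elements compare greater ... }
 */
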
/* Vector extract */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
  __m256d __A = (A); \
  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })

#define _mm256_extractf128_ps(A, O) __extension__ ({ \
  __m256 __A = (A); \
  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })

#define _mm256_extractf128_si256(A, O) __extension__ ({ \
  __m256i __A = (A); \
  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long  __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

/* Vector insert */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })

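/* Usage sketch: the 128-bit extract/insert pairs move whole lanes, e.g. to
 * reduce a __m256 with existing SSE code. The lane index must be a
 * compile-time constant (0 or 1):
 *
 *   __m128 lo  = _mm256_extractf128_ps(v, 0);  // elements 0..3
 *   __m128 hi  = _mm256_extractf128_ps(v, 1);  // elements 4..7
 *   __m128 sum = _mm_add_ps(lo, hi);           // fold the two lanes together
 */
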
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, int b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}

/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

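/* Note: _mm256_zeroupper() clears the upper 128 bits of the YMM registers;
 * calling it before executing legacy (non-VEX encoded) SSE code is the usual
 * way to avoid AVX/SSE transition penalties on hardware that has them.
 */
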
/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  struct __loadu_pd {
    __m256d v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)p)->v;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  struct __loadu_ps {
    __m256 v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_ps*)p)->v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  struct __loadu_si256 {
    __m256i v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_si256*)p)->v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

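/* Usage sketch: the mask operand uses the sign bit of each element.
 * Unselected elements read as 0.0 on a masked load and are left untouched in
 * memory on a masked store:
 *
 *   __m256d mask = _mm256_castsi256_pd(
 *       _mm256_set_epi64x(0, -1, 0, -1));       // select elements 0 and 2
 *   __m256d v = _mm256_maskload_pd(p, mask);    // v[1], v[3] forced to 0.0
 *   _mm256_maskstore_pd(p, mask, v);            // writes only v[0] and v[2]
 */
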
/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

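/* Note: the _mm256_set_* constructors take the highest-indexed element first,
 * so the last argument lands in element 0:
 *
 *   __m256d v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);  // element 0 is 0.0
 *
 * The _mm256_setr_* variants below take the arguments in memory order.
 */
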
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

/* SIMD load ops (unaligned) */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128(float const *addr_hi, float const *addr_lo)
{
  struct __loadu_ps {
    __m128 v;
  } __attribute__((__packed__, __may_alias__));

  __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v);
  return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo)
{
  struct __loadu_pd {
    __m128d v;
  } __attribute__((__packed__, __may_alias__));

  __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v);
  return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo)
{
  struct __loadu_si128 {
    __m128i v;
  } __attribute__((packed, may_alias));
  __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v);
  return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1);
}

/* SIMD store ops (unaligned) */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a)
{
  __m128 v128;

  v128 = _mm256_castps256_ps128(a);
  __builtin_ia32_storeups(addr_lo, v128);
  v128 = _mm256_extractf128_ps(a, 1);
  __builtin_ia32_storeups(addr_hi, v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a)
{
  __m128d v128;

  v128 = _mm256_castpd256_pd128(a);
  __builtin_ia32_storeupd(addr_lo, v128);
  v128 = _mm256_extractf128_pd(a, 1);
  __builtin_ia32_storeupd(addr_hi, v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a)
{
  __m128i v128;

  v128 = _mm256_castsi256_si128(a);
  __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128);
  v128 = _mm256_extractf128_si256(a, 1);
  __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128);
}
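
/* Usage sketch: the loadu2/storeu2 helpers move two independent, possibly
 * unaligned 128-bit halves in a single call:
 *
 *   __m256 v = _mm256_loadu2_m128(p + 4, p);  // low half from p, high from p+4
 *   _mm256_storeu2_m128(q + 4, q, v);         // the matching store
 */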