1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10 #ifndef __IMMINTRIN_H
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12 #endif
13
14 #ifndef __AVX2INTRIN_H
15 #define __AVX2INTRIN_H
16
/* Default attributes for the functions in this file: __always_inline__,
   __nodebug__, an "avx2" __target__, and a __min_vector_width__ of 256 or 128.
   When __EVEX512__ is defined and __AVX10_1_512__ is not, the target string
   also carries "no-evex512". */
#if !defined(__EVEX512__) || defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
                 __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
#endif
33
34 /* SSE4 Multiple Packed Sums of Absolute Difference. */
35 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
36 /// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
37 /// \a Y.
38 ///
39 /// Eight SAD results are computed using the lower half of the input
40 /// vectors, and another eight using the upper half. These 16-bit values
41 /// are returned in the lower and upper halves of the 256-bit result,
42 /// respectively.
43 ///
44 /// A single SAD operation selects four bytes from \a X and four bytes from
45 /// \a Y as input. It computes the differences between each \a X byte and
46 /// the corresponding \a Y byte, takes the absolute value of each
47 /// difference, and sums these four values to form one 16-bit result. The
48 /// intrinsic computes 16 of these results with different sets of input
49 /// bytes.
50 ///
/// For each set of eight results, the SAD operations use the same four
/// bytes from \a Y; the starting bit position for these four bytes is
/// specified by \a M[1:0] times 32 for the lower 128-bit lane and by
/// \a M[4:3] times 32 for the upper lane. The eight operations use
/// successive sets of four bytes from \a X; the starting bit position for
/// the first set of four bytes is specified by \a M[2] times 32 for the
/// lower lane and by \a M[5] times 32 for the upper lane. These bit
/// positions are all relative to the 128-bit lane for each set of eight
/// operations.
57 ///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
75 ///
76 /// \headerfile <immintrin.h>
77 ///
78 /// \code
79 /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
80 /// \endcode
81 ///
82 /// This intrinsic corresponds to the \c VMPSADBW instruction.
83 ///
84 /// \param X
85 /// A 256-bit integer vector containing one of the inputs.
86 /// \param Y
87 /// A 256-bit integer vector containing one of the inputs.
88 /// \param M
89 /// An unsigned immediate value specifying the starting positions of the
90 /// bytes to operate on.
91 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: both vector arguments are cast to __m256i and then __v32qi, and
   the immediate M is cast to int, before being handed to the builtin. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256( \
      (__v32qi)(__m256i)(X), \
      (__v32qi)(__m256i)(Y), \
      (int)(M)))
95
96 /// Computes the absolute value of each signed byte in the 256-bit integer
97 /// vector \a __a and returns each value in the corresponding byte of
98 /// the result.
99 ///
100 /// \headerfile <immintrin.h>
101 ///
102 /// This intrinsic corresponds to the \c VPABSB instruction.
103 ///
104 /// \param __a
105 /// A 256-bit integer vector.
106 /// \returns A 256-bit integer vector containing the result.
107 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi8(__m256i __a)108 _mm256_abs_epi8(__m256i __a)
109 {
110 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
111 }
112
113 /// Computes the absolute value of each signed 16-bit element in the 256-bit
114 /// vector of [16 x i16] in \a __a and returns each value in the
115 /// corresponding element of the result.
116 ///
117 /// \headerfile <immintrin.h>
118 ///
119 /// This intrinsic corresponds to the \c VPABSW instruction.
120 ///
121 /// \param __a
122 /// A 256-bit vector of [16 x i16].
123 /// \returns A 256-bit vector of [16 x i16] containing the result.
124 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi16(__m256i __a)125 _mm256_abs_epi16(__m256i __a)
126 {
127 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
128 }
129
130 /// Computes the absolute value of each signed 32-bit element in the 256-bit
131 /// vector of [8 x i32] in \a __a and returns each value in the
132 /// corresponding element of the result.
133 ///
134 /// \headerfile <immintrin.h>
135 ///
136 /// This intrinsic corresponds to the \c VPABSD instruction.
137 ///
138 /// \param __a
139 /// A 256-bit vector of [8 x i32].
140 /// \returns A 256-bit vector of [8 x i32] containing the result.
141 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi32(__m256i __a)142 _mm256_abs_epi32(__m256i __a)
143 {
144 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
145 }
146
147 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
148 /// integers using signed saturation, and returns the 256-bit result.
149 ///
150 /// \code{.operation}
151 /// FOR i := 0 TO 7
152 /// j := i*16
153 /// k := i*8
154 /// result[7+k:k] := SATURATE8(__a[15+j:j])
155 /// result[71+k:64+k] := SATURATE8(__b[15+j:j])
156 /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
157 /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
158 /// ENDFOR
159 /// \endcode
160 ///
161 /// \headerfile <immintrin.h>
162 ///
163 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
164 ///
165 /// \param __a
166 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
167 /// result[191:128].
168 /// \param __b
169 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
170 /// result[255:192].
171 /// \returns A 256-bit integer vector containing the result.
172 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packs_epi16(__m256i __a,__m256i __b)173 _mm256_packs_epi16(__m256i __a, __m256i __b)
174 {
175 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
176 }
177
178 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
179 /// integers using signed saturation, and returns the resulting 256-bit
180 /// vector of [16 x i16].
181 ///
182 /// \code{.operation}
183 /// FOR i := 0 TO 3
184 /// j := i*32
185 /// k := i*16
186 /// result[15+k:k] := SATURATE16(__a[31+j:j])
187 /// result[79+k:64+k] := SATURATE16(__b[31+j:j])
188 /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
189 /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
190 /// ENDFOR
191 /// \endcode
192 ///
193 /// \headerfile <immintrin.h>
194 ///
195 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
196 ///
197 /// \param __a
198 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
199 /// result[191:128].
200 /// \param __b
201 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
202 /// result[255:192].
203 /// \returns A 256-bit vector of [16 x i16] containing the result.
204 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packs_epi32(__m256i __a,__m256i __b)205 _mm256_packs_epi32(__m256i __a, __m256i __b)
206 {
207 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
208 }
209
210 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
211 /// using unsigned saturation, and returns the 256-bit result.
212 ///
213 /// \code{.operation}
214 /// FOR i := 0 TO 7
215 /// j := i*16
216 /// k := i*8
217 /// result[7+k:k] := SATURATE8U(__a[15+j:j])
218 /// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
219 /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
220 /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
221 /// ENDFOR
222 /// \endcode
223 ///
224 /// \headerfile <immintrin.h>
225 ///
226 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
227 ///
228 /// \param __a
229 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
230 /// result[191:128].
231 /// \param __b
232 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
233 /// result[255:192].
234 /// \returns A 256-bit integer vector containing the result.
235 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packus_epi16(__m256i __a,__m256i __b)236 _mm256_packus_epi16(__m256i __a, __m256i __b)
237 {
238 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
239 }
240
241 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
242 /// using unsigned saturation, and returns the resulting 256-bit vector of
243 /// [16 x i16].
244 ///
245 /// \code{.operation}
246 /// FOR i := 0 TO 3
247 /// j := i*32
248 /// k := i*16
249 /// result[15+k:k] := SATURATE16U(__V1[31+j:j])
250 /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
251 /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
252 /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
253 /// ENDFOR
254 /// \endcode
255 ///
256 /// \headerfile <immintrin.h>
257 ///
258 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
259 ///
260 /// \param __V1
261 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
262 /// result[191:128].
263 /// \param __V2
264 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
265 /// result[255:192].
266 /// \returns A 256-bit vector of [16 x i16] containing the result.
267 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packus_epi32(__m256i __V1,__m256i __V2)268 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
269 {
270 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
271 }
272
273 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
274 /// vectors and returns the lower 8 bits of each sum in the corresponding
275 /// byte of the 256-bit integer vector result (overflow is ignored).
276 ///
277 /// \headerfile <immintrin.h>
278 ///
279 /// This intrinsic corresponds to the \c VPADDB instruction.
280 ///
281 /// \param __a
282 /// A 256-bit integer vector containing one of the source operands.
283 /// \param __b
284 /// A 256-bit integer vector containing one of the source operands.
285 /// \returns A 256-bit integer vector containing the sums.
286 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi8(__m256i __a,__m256i __b)287 _mm256_add_epi8(__m256i __a, __m256i __b)
288 {
289 return (__m256i)((__v32qu)__a + (__v32qu)__b);
290 }
291
292 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
293 /// [16 x i16] and returns the lower 16 bits of each sum in the
294 /// corresponding element of the [16 x i16] result (overflow is ignored).
295 ///
296 /// \headerfile <immintrin.h>
297 ///
298 /// This intrinsic corresponds to the \c VPADDW instruction.
299 ///
300 /// \param __a
301 /// A 256-bit vector of [16 x i16] containing one of the source operands.
302 /// \param __b
303 /// A 256-bit vector of [16 x i16] containing one of the source operands.
304 /// \returns A 256-bit vector of [16 x i16] containing the sums.
305 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi16(__m256i __a,__m256i __b)306 _mm256_add_epi16(__m256i __a, __m256i __b)
307 {
308 return (__m256i)((__v16hu)__a + (__v16hu)__b);
309 }
310
311 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
312 /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
313 /// element of the [8 x i32] result (overflow is ignored).
314 ///
315 /// \headerfile <immintrin.h>
316 ///
317 /// This intrinsic corresponds to the \c VPADDD instruction.
318 ///
319 /// \param __a
320 /// A 256-bit vector of [8 x i32] containing one of the source operands.
321 /// \param __b
322 /// A 256-bit vector of [8 x i32] containing one of the source operands.
323 /// \returns A 256-bit vector of [8 x i32] containing the sums.
324 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi32(__m256i __a,__m256i __b)325 _mm256_add_epi32(__m256i __a, __m256i __b)
326 {
327 return (__m256i)((__v8su)__a + (__v8su)__b);
328 }
329
330 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
331 /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
332 /// element of the [4 x i64] result (overflow is ignored).
333 ///
334 /// \headerfile <immintrin.h>
335 ///
336 /// This intrinsic corresponds to the \c VPADDQ instruction.
337 ///
338 /// \param __a
339 /// A 256-bit vector of [4 x i64] containing one of the source operands.
340 /// \param __b
341 /// A 256-bit vector of [4 x i64] containing one of the source operands.
342 /// \returns A 256-bit vector of [4 x i64] containing the sums.
343 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi64(__m256i __a,__m256i __b)344 _mm256_add_epi64(__m256i __a, __m256i __b)
345 {
346 return (__m256i)((__v4du)__a + (__v4du)__b);
347 }
348
349 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
350 /// vectors using signed saturation, and returns each sum in the
351 /// corresponding byte of the 256-bit integer vector result.
352 ///
353 /// \headerfile <immintrin.h>
354 ///
355 /// This intrinsic corresponds to the \c VPADDSB instruction.
356 ///
357 /// \param __a
358 /// A 256-bit integer vector containing one of the source operands.
359 /// \param __b
360 /// A 256-bit integer vector containing one of the source operands.
361 /// \returns A 256-bit integer vector containing the sums.
362 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi8(__m256i __a,__m256i __b)363 _mm256_adds_epi8(__m256i __a, __m256i __b)
364 {
365 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
366 }
367
368 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
369 /// [16 x i16] using signed saturation, and returns the [16 x i16] result.
370 ///
371 /// \headerfile <immintrin.h>
372 ///
373 /// This intrinsic corresponds to the \c VPADDSW instruction.
374 ///
375 /// \param __a
376 /// A 256-bit vector of [16 x i16] containing one of the source operands.
377 /// \param __b
378 /// A 256-bit vector of [16 x i16] containing one of the source operands.
379 /// \returns A 256-bit vector of [16 x i16] containing the sums.
380 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi16(__m256i __a,__m256i __b)381 _mm256_adds_epi16(__m256i __a, __m256i __b)
382 {
383 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
384 }
385
386 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
387 /// vectors using unsigned saturation, and returns each sum in the
388 /// corresponding byte of the 256-bit integer vector result.
389 ///
390 /// \headerfile <immintrin.h>
391 ///
392 /// This intrinsic corresponds to the \c VPADDUSB instruction.
393 ///
394 /// \param __a
395 /// A 256-bit integer vector containing one of the source operands.
396 /// \param __b
397 /// A 256-bit integer vector containing one of the source operands.
398 /// \returns A 256-bit integer vector containing the sums.
399 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu8(__m256i __a,__m256i __b)400 _mm256_adds_epu8(__m256i __a, __m256i __b)
401 {
402 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
403 }
404
405 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
406 /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
407 ///
408 /// \headerfile <immintrin.h>
409 ///
410 /// This intrinsic corresponds to the \c VPADDUSW instruction.
411 ///
412 /// \param __a
413 /// A 256-bit vector of [16 x i16] containing one of the source operands.
414 /// \param __b
415 /// A 256-bit vector of [16 x i16] containing one of the source operands.
416 /// \returns A 256-bit vector of [16 x i16] containing the sums.
417 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu16(__m256i __a,__m256i __b)418 _mm256_adds_epu16(__m256i __a, __m256i __b)
419 {
420 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
421 }
422
423 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
424 /// temporary 256-bit value, and the lower half of the 256-bit vector \a b
425 /// as the lower half of the temporary value. Right-shifts the temporary
426 /// value by \a n bytes, and uses the lower 16 bytes of the shifted value
427 /// as the lower 16 bytes of the result. Uses the upper halves of \a a and
428 /// \a b to make another temporary value, right shifts by \a n, and uses
429 /// the lower 16 bytes of the shifted value as the upper 16 bytes of the
430 /// result.
431 ///
432 /// \headerfile <immintrin.h>
433 ///
434 /// \code
435 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
436 /// \endcode
437 ///
438 /// This intrinsic corresponds to the \c VPALIGNR instruction.
439 ///
440 /// \param a
441 /// A 256-bit integer vector containing source values.
442 /// \param b
443 /// A 256-bit integer vector containing source values.
444 /// \param n
445 /// An immediate value specifying the number of bytes to shift.
446 /// \returns A 256-bit integer vector containing the result.
/* Macro form: both vector arguments are cast to __m256i and then __v32qi. The
   byte count n is cast to int, matching how the other immediate-taking macros
   in this header (_mm256_mpsadbw_epu8, _mm256_blend_epi16) pass their
   immediate operand to the builtin. */
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (int)(n)))
450
451 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
452 /// \a __b.
453 ///
454 /// \headerfile <immintrin.h>
455 ///
456 /// This intrinsic corresponds to the \c VPAND instruction.
457 ///
458 /// \param __a
459 /// A 256-bit integer vector.
460 /// \param __b
461 /// A 256-bit integer vector.
462 /// \returns A 256-bit integer vector containing the result.
463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_and_si256(__m256i __a,__m256i __b)464 _mm256_and_si256(__m256i __a, __m256i __b)
465 {
466 return (__m256i)((__v4du)__a & (__v4du)__b);
467 }
468
469 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
470 /// the bitwise NOT of the 256-bit integer vector in \a __a.
471 ///
472 /// \headerfile <immintrin.h>
473 ///
474 /// This intrinsic corresponds to the \c VPANDN instruction.
475 ///
476 /// \param __a
477 /// A 256-bit integer vector.
478 /// \param __b
479 /// A 256-bit integer vector.
480 /// \returns A 256-bit integer vector containing the result.
481 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_andnot_si256(__m256i __a,__m256i __b)482 _mm256_andnot_si256(__m256i __a, __m256i __b)
483 {
484 return (__m256i)(~(__v4du)__a & (__v4du)__b);
485 }
486
487 /// Computes the averages of the corresponding unsigned bytes in the two
488 /// 256-bit integer vectors in \a __a and \a __b and returns each
489 /// average in the corresponding byte of the 256-bit result.
490 ///
491 /// \code{.operation}
492 /// FOR i := 0 TO 31
493 /// j := i*8
494 /// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
495 /// ENDFOR
496 /// \endcode
497 ///
498 /// \headerfile <immintrin.h>
499 ///
500 /// This intrinsic corresponds to the \c VPAVGB instruction.
501 ///
502 /// \param __a
503 /// A 256-bit integer vector.
504 /// \param __b
505 /// A 256-bit integer vector.
506 /// \returns A 256-bit integer vector containing the result.
507 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu8(__m256i __a,__m256i __b)508 _mm256_avg_epu8(__m256i __a, __m256i __b)
509 {
510 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
511 }
512
513 /// Computes the averages of the corresponding unsigned 16-bit integers in
514 /// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
515 /// each average in the corresponding element of the 256-bit result.
516 ///
517 /// \code{.operation}
518 /// FOR i := 0 TO 15
519 /// j := i*16
520 /// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
521 /// ENDFOR
522 /// \endcode
523 ///
524 /// \headerfile <immintrin.h>
525 ///
526 /// This intrinsic corresponds to the \c VPAVGW instruction.
527 ///
528 /// \param __a
529 /// A 256-bit vector of [16 x i16].
530 /// \param __b
531 /// A 256-bit vector of [16 x i16].
532 /// \returns A 256-bit vector of [16 x i16] containing the result.
533 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu16(__m256i __a,__m256i __b)534 _mm256_avg_epu16(__m256i __a, __m256i __b)
535 {
536 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
537 }
538
539 /// Merges 8-bit integer values from either of the two 256-bit vectors
540 /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
541 /// the resulting 256-bit integer vector.
542 ///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   IF __M[7+j] == 0
///     result[7+j:j] := __V1[7+j:j]
///   ELSE
///     result[7+j:j] := __V2[7+j:j]
///   FI
/// ENDFOR
/// \endcode
553 ///
554 /// \headerfile <immintrin.h>
555 ///
556 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
557 ///
558 /// \param __V1
559 /// A 256-bit integer vector containing source values.
560 /// \param __V2
561 /// A 256-bit integer vector containing source values.
562 /// \param __M
563 /// A 256-bit integer vector, with bit [7] of each byte specifying the
564 /// source for each corresponding byte of the result. When the mask bit
565 /// is 0, the byte is copied from \a __V1; otherwise, it is copied from
566 /// \a __V2.
567 /// \returns A 256-bit integer vector containing the result.
568 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_blendv_epi8(__m256i __V1,__m256i __V2,__m256i __M)569 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
570 {
571 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
572 (__v32qi)__M);
573 }
574
575 /// Merges 16-bit integer values from either of the two 256-bit vectors
576 /// \a V1 or \a V2, as specified by the immediate integer operand \a M,
577 /// and returns the resulting 256-bit vector of [16 x i16].
578 ///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
591 ///
592 /// \headerfile <immintrin.h>
593 ///
594 /// \code
595 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
596 /// \endcode
597 ///
598 /// This intrinsic corresponds to the \c VPBLENDW instruction.
599 ///
600 /// \param V1
601 /// A 256-bit vector of [16 x i16] containing source values.
602 /// \param V2
603 /// A 256-bit vector of [16 x i16] containing source values.
604 /// \param M
605 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
606 /// source for each element of the result. The position of the mask bit
607 /// corresponds to the index of a copied value. When a mask bit is 0, the
608 /// element is copied from \a V1; otherwise, it is copied from \a V2.
609 /// \a M[0] determines the source for elements 0 and 8, \a M[1] for
610 /// elements 1 and 9, and so forth.
611 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: both vector arguments are cast to __m256i and then __v16hi, and
   the immediate M is cast to int, before being handed to the builtin. */
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256( \
      (__v16hi)(__m256i)(V1), \
      (__v16hi)(__m256i)(V2), \
      (int)(M)))
615
616 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
617 /// \a __b for equality and returns the outcomes in the corresponding
618 /// bytes of the 256-bit result.
619 ///
620 /// \code{.operation}
621 /// FOR i := 0 TO 31
622 /// j := i*8
623 /// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
624 /// ENDFOR
625 /// \endcode
626 ///
627 /// \headerfile <immintrin.h>
628 ///
629 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
630 ///
631 /// \param __a
632 /// A 256-bit integer vector containing one of the inputs.
633 /// \param __b
634 /// A 256-bit integer vector containing one of the inputs.
635 /// \returns A 256-bit integer vector containing the result.
636 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi8(__m256i __a,__m256i __b)637 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
638 {
639 return (__m256i)((__v32qi)__a == (__v32qi)__b);
640 }
641
642 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
643 /// \a __a and \a __b for equality and returns the outcomes in the
644 /// corresponding elements of the 256-bit result.
645 ///
646 /// \code{.operation}
647 /// FOR i := 0 TO 15
648 /// j := i*16
649 /// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
650 /// ENDFOR
651 /// \endcode
652 ///
653 /// \headerfile <immintrin.h>
654 ///
655 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
656 ///
657 /// \param __a
658 /// A 256-bit vector of [16 x i16] containing one of the inputs.
659 /// \param __b
660 /// A 256-bit vector of [16 x i16] containing one of the inputs.
661 /// \returns A 256-bit vector of [16 x i16] containing the result.
662 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi16(__m256i __a,__m256i __b)663 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
664 {
665 return (__m256i)((__v16hi)__a == (__v16hi)__b);
666 }
667
668 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
669 /// \a __a and \a __b for equality and returns the outcomes in the
670 /// corresponding elements of the 256-bit result.
671 ///
672 /// \code{.operation}
673 /// FOR i := 0 TO 7
674 /// j := i*32
675 /// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
676 /// ENDFOR
677 /// \endcode
678 ///
679 /// \headerfile <immintrin.h>
680 ///
681 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
682 ///
683 /// \param __a
684 /// A 256-bit vector of [8 x i32] containing one of the inputs.
685 /// \param __b
686 /// A 256-bit vector of [8 x i32] containing one of the inputs.
687 /// \returns A 256-bit vector of [8 x i32] containing the result.
688 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi32(__m256i __a,__m256i __b)689 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
690 {
691 return (__m256i)((__v8si)__a == (__v8si)__b);
692 }
693
694 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
695 /// \a __a and \a __b for equality and returns the outcomes in the
696 /// corresponding elements of the 256-bit result.
697 ///
698 /// \code{.operation}
699 /// FOR i := 0 TO 3
700 /// j := i*64
701 /// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
702 /// ENDFOR
703 /// \endcode
704 ///
705 /// \headerfile <immintrin.h>
706 ///
707 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
708 ///
709 /// \param __a
710 /// A 256-bit vector of [4 x i64] containing one of the inputs.
711 /// \param __b
712 /// A 256-bit vector of [4 x i64] containing one of the inputs.
713 /// \returns A 256-bit vector of [4 x i64] containing the result.
714 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi64(__m256i __a,__m256i __b)715 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
716 {
717 return (__m256i)((__v4di)__a == (__v4di)__b);
718 }
719
720 /// Compares corresponding signed bytes in the 256-bit integer vectors in
721 /// \a __a and \a __b for greater-than and returns the outcomes in the
722 /// corresponding bytes of the 256-bit result.
723 ///
724 /// \code{.operation}
725 /// FOR i := 0 TO 31
726 /// j := i*8
727 /// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
728 /// ENDFOR
729 /// \endcode
730 ///
731 /// \headerfile <immintrin.h>
732 ///
733 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
734 ///
735 /// \param __a
736 /// A 256-bit integer vector containing one of the inputs.
737 /// \param __b
738 /// A 256-bit integer vector containing one of the inputs.
739 /// \returns A 256-bit integer vector containing the result.
740 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi8(__m256i __a,__m256i __b)741 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
742 {
743 /* This function always performs a signed comparison, but __v32qi is a char
744 which may be signed or unsigned, so use __v32qs. */
745 return (__m256i)((__v32qs)__a > (__v32qs)__b);
746 }
747
748 /// Compares corresponding signed elements in the 256-bit vectors of
749 /// [16 x i16] in \a __a and \a __b for greater-than and returns the
750 /// outcomes in the corresponding elements of the 256-bit result.
751 ///
752 /// \code{.operation}
753 /// FOR i := 0 TO 15
754 /// j := i*16
755 /// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
756 /// ENDFOR
757 /// \endcode
758 ///
759 /// \headerfile <immintrin.h>
760 ///
761 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
762 ///
763 /// \param __a
764 /// A 256-bit vector of [16 x i16] containing one of the inputs.
765 /// \param __b
766 /// A 256-bit vector of [16 x i16] containing one of the inputs.
767 /// \returns A 256-bit vector of [16 x i16] containing the result.
768 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi16(__m256i __a,__m256i __b)769 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
770 {
771 return (__m256i)((__v16hi)__a > (__v16hi)__b);
772 }
773
774 /// Compares corresponding signed elements in the 256-bit vectors of
775 /// [8 x i32] in \a __a and \a __b for greater-than and returns the
776 /// outcomes in the corresponding elements of the 256-bit result.
777 ///
778 /// \code{.operation}
779 /// FOR i := 0 TO 7
780 /// j := i*32
781 /// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
782 /// ENDFOR
783 /// \endcode
784 ///
785 /// \headerfile <immintrin.h>
786 ///
787 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
788 ///
789 /// \param __a
790 /// A 256-bit vector of [8 x i32] containing one of the inputs.
791 /// \param __b
792 /// A 256-bit vector of [8 x i32] containing one of the inputs.
793 /// \returns A 256-bit vector of [8 x i32] containing the result.
794 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi32(__m256i __a,__m256i __b)795 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
796 {
797 return (__m256i)((__v8si)__a > (__v8si)__b);
798 }
799
800 /// Compares corresponding signed elements in the 256-bit vectors of
801 /// [4 x i64] in \a __a and \a __b for greater-than and returns the
802 /// outcomes in the corresponding elements of the 256-bit result.
803 ///
804 /// \code{.operation}
805 /// FOR i := 0 TO 3
806 /// j := i*64
807 /// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
808 /// ENDFOR
809 /// \endcode
810 ///
811 /// \headerfile <immintrin.h>
812 ///
813 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
814 ///
815 /// \param __a
816 /// A 256-bit vector of [4 x i64] containing one of the inputs.
817 /// \param __b
818 /// A 256-bit vector of [4 x i64] containing one of the inputs.
819 /// \returns A 256-bit vector of [4 x i64] containing the result.
820 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi64(__m256i __a,__m256i __b)821 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
822 {
823 return (__m256i)((__v4di)__a > (__v4di)__b);
824 }
825
826 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
827 /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
828 /// element of the [16 x i16] result (overflow is ignored). Sums from
829 /// \a __a are returned in the lower 64 bits of each 128-bit half of the
830 /// result; sums from \a __b are returned in the upper 64 bits of each
831 /// 128-bit half of the result.
832 ///
833 /// \code{.operation}
834 /// FOR i := 0 TO 1
835 /// j := i*128
836 /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
837 /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
838 /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
839 /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
840 /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
841 /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
842 /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
843 /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
844 /// ENDFOR
845 /// \endcode
846 ///
847 /// \headerfile <immintrin.h>
848 ///
849 /// This intrinsic corresponds to the \c VPHADDW instruction.
850 ///
851 /// \param __a
852 /// A 256-bit vector of [16 x i16] containing one of the source operands.
853 /// \param __b
854 /// A 256-bit vector of [16 x i16] containing one of the source operands.
855 /// \returns A 256-bit vector of [16 x i16] containing the sums.
856 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi16(__m256i __a,__m256i __b)857 _mm256_hadd_epi16(__m256i __a, __m256i __b)
858 {
859 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
860 }
861
862 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
863 /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
864 /// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
865 /// are returned in the lower 64 bits of each 128-bit half of the result;
866 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half
867 /// of the result.
868 ///
869 /// \code{.operation}
870 /// FOR i := 0 TO 1
871 /// j := i*128
872 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
873 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
874 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
875 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
876 /// ENDFOR
877 /// \endcode
878 ///
879 /// \headerfile <immintrin.h>
880 ///
881 /// This intrinsic corresponds to the \c VPHADDD instruction.
882 ///
883 /// \param __a
884 /// A 256-bit vector of [8 x i32] containing one of the source operands.
885 /// \param __b
886 /// A 256-bit vector of [8 x i32] containing one of the source operands.
887 /// \returns A 256-bit vector of [8 x i32] containing the sums.
888 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi32(__m256i __a,__m256i __b)889 _mm256_hadd_epi32(__m256i __a, __m256i __b)
890 {
891 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
892 }
893
894 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
895 /// vectors of [16 x i16] using signed saturation and returns each sum in
896 /// an element of the [16 x i16] result. Sums from \a __a are returned in
897 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b
898 /// are returned in the upper 64 bits of each 128-bit half of the result.
899 ///
900 /// \code{.operation}
901 /// FOR i := 0 TO 1
902 /// j := i*128
903 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
904 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
905 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
906 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
907 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
908 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
909 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
910 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
911 /// ENDFOR
912 /// \endcode
913 ///
914 /// \headerfile <immintrin.h>
915 ///
916 /// This intrinsic corresponds to the \c VPHADDSW instruction.
917 ///
918 /// \param __a
919 /// A 256-bit vector of [16 x i16] containing one of the source operands.
920 /// \param __b
921 /// A 256-bit vector of [16 x i16] containing one of the source operands.
922 /// \returns A 256-bit vector of [16 x i16] containing the sums.
923 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadds_epi16(__m256i __a,__m256i __b)924 _mm256_hadds_epi16(__m256i __a, __m256i __b)
925 {
926 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
927 }
928
929 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
930 /// vectors of [16 x i16] and returns the lower 16 bits of each difference
931 /// in an element of the [16 x i16] result (overflow is ignored).
932 /// Differences from \a __a are returned in the lower 64 bits of each
933 /// 128-bit half of the result; differences from \a __b are returned in the
934 /// upper 64 bits of each 128-bit half of the result.
935 ///
936 /// \code{.operation}
937 /// FOR i := 0 TO 1
938 /// j := i*128
939 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
940 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
941 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
942 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
943 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
944 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
945 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
946 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
947 /// ENDFOR
948 /// \endcode
949 ///
950 /// \headerfile <immintrin.h>
951 ///
952 /// This intrinsic corresponds to the \c VPHSUBW instruction.
953 ///
954 /// \param __a
955 /// A 256-bit vector of [16 x i16] containing one of the source operands.
956 /// \param __b
957 /// A 256-bit vector of [16 x i16] containing one of the source operands.
958 /// \returns A 256-bit vector of [16 x i16] containing the differences.
959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsub_epi16(__m256i __a,__m256i __b)960 _mm256_hsub_epi16(__m256i __a, __m256i __b)
961 {
962 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
963 }
964
965 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
966 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in
967 /// an element of the [8 x i32] result (overflow is ignored). Differences
968 /// from \a __a are returned in the lower 64 bits of each 128-bit half of
969 /// the result; differences from \a __b are returned in the upper 64 bits
970 /// of each 128-bit half of the result.
971 ///
972 /// \code{.operation}
973 /// FOR i := 0 TO 1
974 /// j := i*128
975 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
976 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
977 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
978 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
979 /// ENDFOR
980 /// \endcode
981 ///
982 /// \headerfile <immintrin.h>
983 ///
984 /// This intrinsic corresponds to the \c VPHSUBD instruction.
985 ///
986 /// \param __a
987 /// A 256-bit vector of [8 x i32] containing one of the source operands.
988 /// \param __b
989 /// A 256-bit vector of [8 x i32] containing one of the source operands.
990 /// \returns A 256-bit vector of [8 x i32] containing the differences.
991 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsub_epi32(__m256i __a,__m256i __b)992 _mm256_hsub_epi32(__m256i __a, __m256i __b)
993 {
994 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
995 }
996
997 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
998 /// vectors of [16 x i16] using signed saturation and returns each sum in
999 /// an element of the [16 x i16] result. Differences from \a __a are
1000 /// returned in the lower 64 bits of each 128-bit half of the result;
1001 /// differences from \a __b are returned in the upper 64 bits of each
1002 /// 128-bit half of the result.
1003 ///
1004 /// \code{.operation}
1005 /// FOR i := 0 TO 1
1006 /// j := i*128
1007 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
1008 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1009 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1010 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1011 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1012 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1013 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1014 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1015 /// ENDFOR
1016 /// \endcode
1017 ///
1018 /// \headerfile <immintrin.h>
1019 ///
1020 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
1021 ///
1022 /// \param __a
1023 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1024 /// \param __b
1025 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1026 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsubs_epi16(__m256i __a,__m256i __b)1028 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
1029 {
1030 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1031 }
1032
1033 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1034 /// with the corresponding signed byte from the 256-bit integer vector in
1035 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1036 /// pairs of those products using signed saturation to form 16-bit sums
1037 /// returned as elements of the [16 x i16] result.
1038 ///
1039 /// \code{.operation}
1040 /// FOR i := 0 TO 15
1041 /// j := i*16
1042 /// temp1 := __a[j+7:j] * __b[j+7:j]
1043 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1044 /// result[j+15:j] := SATURATE16(temp1 + temp2)
1045 /// ENDFOR
1046 /// \endcode
1047 ///
1048 /// \headerfile <immintrin.h>
1049 ///
1050 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1051 ///
1052 /// \param __a
1053 /// A 256-bit vector containing one of the source operands.
1054 /// \param __b
1055 /// A 256-bit vector containing one of the source operands.
1056 /// \returns A 256-bit vector of [16 x i16] containing the result.
1057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maddubs_epi16(__m256i __a,__m256i __b)1058 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
1059 {
1060 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1061 }
1062
1063 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1064 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1065 /// those products to form 32-bit sums returned as elements of the
1066 /// [8 x i32] result.
1067 ///
1068 /// There is only one wraparound case: when all four of the 16-bit sources
1069 /// are \c 0x8000, the result will be \c 0x80000000.
1070 ///
1071 /// \code{.operation}
1072 /// FOR i := 0 TO 7
1073 /// j := i*32
1074 /// temp1 := __a[j+15:j] * __b[j+15:j]
1075 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1076 /// result[j+31:j] := temp1 + temp2
1077 /// ENDFOR
1078 /// \endcode
1079 ///
1080 /// \headerfile <immintrin.h>
1081 ///
1082 /// This intrinsic corresponds to the \c VPMADDWD instruction.
1083 ///
1084 /// \param __a
1085 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1086 /// \param __b
1087 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1088 /// \returns A 256-bit vector of [8 x i32] containing the result.
1089 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd_epi16(__m256i __a,__m256i __b)1090 _mm256_madd_epi16(__m256i __a, __m256i __b)
1091 {
1092 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1093 }
1094
1095 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1096 /// in \a __a and \a __b and returns the larger of each pair in the
1097 /// corresponding byte of the 256-bit result.
1098 ///
1099 /// \headerfile <immintrin.h>
1100 ///
1101 /// This intrinsic corresponds to the \c VPMAXSB instruction.
1102 ///
1103 /// \param __a
1104 /// A 256-bit integer vector.
1105 /// \param __b
1106 /// A 256-bit integer vector.
1107 /// \returns A 256-bit integer vector containing the result.
1108 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi8(__m256i __a,__m256i __b)1109 _mm256_max_epi8(__m256i __a, __m256i __b)
1110 {
1111 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1112 }
1113
1114 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1115 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1116 /// each pair in the corresponding element of the 256-bit result.
1117 ///
1118 /// \headerfile <immintrin.h>
1119 ///
1120 /// This intrinsic corresponds to the \c VPMAXSW instruction.
1121 ///
1122 /// \param __a
1123 /// A 256-bit vector of [16 x i16].
1124 /// \param __b
1125 /// A 256-bit vector of [16 x i16].
1126 /// \returns A 256-bit vector of [16 x i16] containing the result.
1127 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi16(__m256i __a,__m256i __b)1128 _mm256_max_epi16(__m256i __a, __m256i __b)
1129 {
1130 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1131 }
1132
1133 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1134 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1135 /// each pair in the corresponding element of the 256-bit result.
1136 ///
1137 /// \headerfile <immintrin.h>
1138 ///
1139 /// This intrinsic corresponds to the \c VPMAXSD instruction.
1140 ///
1141 /// \param __a
1142 /// A 256-bit vector of [8 x i32].
1143 /// \param __b
1144 /// A 256-bit vector of [8 x i32].
1145 /// \returns A 256-bit vector of [8 x i32] containing the result.
1146 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi32(__m256i __a,__m256i __b)1147 _mm256_max_epi32(__m256i __a, __m256i __b)
1148 {
1149 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1150 }
1151
1152 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1153 /// vectors in \a __a and \a __b and returns the larger of each pair in
1154 /// the corresponding byte of the 256-bit result.
1155 ///
1156 /// \headerfile <immintrin.h>
1157 ///
1158 /// This intrinsic corresponds to the \c VPMAXUB instruction.
1159 ///
1160 /// \param __a
1161 /// A 256-bit integer vector.
1162 /// \param __b
1163 /// A 256-bit integer vector.
1164 /// \returns A 256-bit integer vector containing the result.
1165 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu8(__m256i __a,__m256i __b)1166 _mm256_max_epu8(__m256i __a, __m256i __b)
1167 {
1168 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1169 }
1170
1171 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1172 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1173 /// each pair in the corresponding element of the 256-bit result.
1174 ///
1175 /// \headerfile <immintrin.h>
1176 ///
1177 /// This intrinsic corresponds to the \c VPMAXUW instruction.
1178 ///
1179 /// \param __a
1180 /// A 256-bit vector of [16 x i16].
1181 /// \param __b
1182 /// A 256-bit vector of [16 x i16].
1183 /// \returns A 256-bit vector of [16 x i16] containing the result.
1184 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu16(__m256i __a,__m256i __b)1185 _mm256_max_epu16(__m256i __a, __m256i __b)
1186 {
1187 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1188 }
1189
1190 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1191 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1192 /// each pair in the corresponding element of the 256-bit result.
1193 ///
1194 /// \headerfile <immintrin.h>
1195 ///
1196 /// This intrinsic corresponds to the \c VPMAXUD instruction.
1197 ///
1198 /// \param __a
1199 /// A 256-bit vector of [8 x i32].
1200 /// \param __b
1201 /// A 256-bit vector of [8 x i32].
1202 /// \returns A 256-bit vector of [8 x i32] containing the result.
1203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu32(__m256i __a,__m256i __b)1204 _mm256_max_epu32(__m256i __a, __m256i __b)
1205 {
1206 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1207 }
1208
1209 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1210 /// in \a __a and \a __b and returns the smaller of each pair in the
1211 /// corresponding byte of the 256-bit result.
1212 ///
1213 /// \headerfile <immintrin.h>
1214 ///
1215 /// This intrinsic corresponds to the \c VPMINSB instruction.
1216 ///
1217 /// \param __a
1218 /// A 256-bit integer vector.
1219 /// \param __b
1220 /// A 256-bit integer vector.
1221 /// \returns A 256-bit integer vector containing the result.
1222 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi8(__m256i __a,__m256i __b)1223 _mm256_min_epi8(__m256i __a, __m256i __b)
1224 {
1225 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1226 }
1227
1228 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1229 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1230 /// each pair in the corresponding element of the 256-bit result.
1231 ///
1232 /// \headerfile <immintrin.h>
1233 ///
1234 /// This intrinsic corresponds to the \c VPMINSW instruction.
1235 ///
1236 /// \param __a
1237 /// A 256-bit vector of [16 x i16].
1238 /// \param __b
1239 /// A 256-bit vector of [16 x i16].
1240 /// \returns A 256-bit vector of [16 x i16] containing the result.
1241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi16(__m256i __a,__m256i __b)1242 _mm256_min_epi16(__m256i __a, __m256i __b)
1243 {
1244 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1245 }
1246
1247 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1248 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1249 /// each pair in the corresponding element of the 256-bit result.
1250 ///
1251 /// \headerfile <immintrin.h>
1252 ///
1253 /// This intrinsic corresponds to the \c VPMINSD instruction.
1254 ///
1255 /// \param __a
1256 /// A 256-bit vector of [8 x i32].
1257 /// \param __b
1258 /// A 256-bit vector of [8 x i32].
1259 /// \returns A 256-bit vector of [8 x i32] containing the result.
1260 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi32(__m256i __a,__m256i __b)1261 _mm256_min_epi32(__m256i __a, __m256i __b)
1262 {
1263 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1264 }
1265
1266 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1267 /// vectors in \a __a and \a __b and returns the smaller of each pair in
1268 /// the corresponding byte of the 256-bit result.
1269 ///
1270 /// \headerfile <immintrin.h>
1271 ///
1272 /// This intrinsic corresponds to the \c VPMINUB instruction.
1273 ///
1274 /// \param __a
1275 /// A 256-bit integer vector.
1276 /// \param __b
1277 /// A 256-bit integer vector.
1278 /// \returns A 256-bit integer vector containing the result.
1279 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu8(__m256i __a,__m256i __b)1280 _mm256_min_epu8(__m256i __a, __m256i __b)
1281 {
1282 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1283 }
1284
1285 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1286 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1287 /// each pair in the corresponding element of the 256-bit result.
1288 ///
1289 /// \headerfile <immintrin.h>
1290 ///
1291 /// This intrinsic corresponds to the \c VPMINUW instruction.
1292 ///
1293 /// \param __a
1294 /// A 256-bit vector of [16 x i16].
1295 /// \param __b
1296 /// A 256-bit vector of [16 x i16].
1297 /// \returns A 256-bit vector of [16 x i16] containing the result.
1298 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu16(__m256i __a,__m256i __b)1299 _mm256_min_epu16(__m256i __a, __m256i __b)
1300 {
1301 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1302 }
1303
1304 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1305 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1306 /// each pair in the corresponding element of the 256-bit result.
1307 ///
1308 /// \headerfile <immintrin.h>
1309 ///
1310 /// This intrinsic corresponds to the \c VPMINUD instruction.
1311 ///
1312 /// \param __a
1313 /// A 256-bit vector of [8 x i32].
1314 /// \param __b
1315 /// A 256-bit vector of [8 x i32].
1316 /// \returns A 256-bit vector of [8 x i32] containing the result.
1317 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu32(__m256i __a,__m256i __b)1318 _mm256_min_epu32(__m256i __a, __m256i __b)
1319 {
1320 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1321 }
1322
1323 /// Creates a 32-bit integer mask from the most significant bit of each byte
1324 /// in the 256-bit integer vector in \a __a and returns the result.
1325 ///
1326 /// \code{.operation}
1327 /// FOR i := 0 TO 31
1328 /// j := i*8
1329 /// result[i] := __a[j+7]
1330 /// ENDFOR
1331 /// \endcode
1332 ///
1333 /// \headerfile <immintrin.h>
1334 ///
1335 /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1336 ///
1337 /// \param __a
1338 /// A 256-bit integer vector containing the source bytes.
1339 /// \returns The 32-bit integer mask.
1340 static __inline__ int __DEFAULT_FN_ATTRS256
_mm256_movemask_epi8(__m256i __a)1341 _mm256_movemask_epi8(__m256i __a)
1342 {
1343 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1344 }
1345
1346 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1347 /// the 16-bit values in the corresponding elements of a 256-bit vector
1348 /// of [16 x i16].
1349 ///
1350 /// \code{.operation}
1351 /// FOR i := 0 TO 15
1352 /// j := i*8
1353 /// k := i*16
1354 /// result[k+15:k] := SignExtend(__V[j+7:j])
1355 /// ENDFOR
1356 /// \endcode
1357 ///
1358 /// \headerfile <immintrin.h>
1359 ///
1360 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1361 ///
1362 /// \param __V
1363 /// A 128-bit integer vector containing the source bytes.
1364 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1365 /// values.
1366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi16(__m128i __V)1367 _mm256_cvtepi8_epi16(__m128i __V)
1368 {
1369 /* This function always performs a signed extension, but __v16qi is a char
1370 which may be signed or unsigned, so use __v16qs. */
1371 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1372 }
1373
1374 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1375 /// \a __V and returns the 32-bit values in the corresponding elements of a
1376 /// 256-bit vector of [8 x i32].
1377 ///
1378 /// \code{.operation}
1379 /// FOR i := 0 TO 7
1380 /// j := i*8
1381 /// k := i*32
1382 /// result[k+31:k] := SignExtend(__V[j+7:j])
1383 /// ENDFOR
1384 /// \endcode
1385 ///
1386 /// \headerfile <immintrin.h>
1387 ///
1388 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1389 ///
1390 /// \param __V
1391 /// A 128-bit integer vector containing the source bytes.
1392 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1393 /// values.
1394 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi32(__m128i __V)1395 _mm256_cvtepi8_epi32(__m128i __V)
1396 {
1397 /* This function always performs a signed extension, but __v16qi is a char
1398 which may be signed or unsigned, so use __v16qs. */
1399 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1400 }
1401
1402 /// Sign-extends the first four bytes from the 128-bit integer vector in
1403 /// \a __V and returns the 64-bit values in the corresponding elements of a
1404 /// 256-bit vector of [4 x i64].
1405 ///
1406 /// \code{.operation}
1407 /// result[63:0] := SignExtend(__V[7:0])
1408 /// result[127:64] := SignExtend(__V[15:8])
1409 /// result[191:128] := SignExtend(__V[23:16])
1410 /// result[255:192] := SignExtend(__V[31:24])
1411 /// \endcode
1412 ///
1413 /// \headerfile <immintrin.h>
1414 ///
1415 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1416 ///
1417 /// \param __V
1418 /// A 128-bit integer vector containing the source bytes.
1419 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1420 /// values.
1421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi64(__m128i __V)1422 _mm256_cvtepi8_epi64(__m128i __V)
1423 {
1424 /* This function always performs a signed extension, but __v16qi is a char
1425 which may be signed or unsigned, so use __v16qs. */
1426 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1427 }
1428
1429 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1430 /// \a __V and returns the 32-bit values in the corresponding elements of a
1431 /// 256-bit vector of [8 x i32].
1432 ///
1433 /// \code{.operation}
1434 /// FOR i := 0 TO 7
1435 /// j := i*16
1436 /// k := i*32
1437 /// result[k+31:k] := SignExtend(__V[j+15:j])
1438 /// ENDFOR
1439 /// \endcode
1440 ///
1441 /// \headerfile <immintrin.h>
1442 ///
1443 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1444 ///
1445 /// \param __V
1446 /// A 128-bit vector of [8 x i16] containing the source values.
1447 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1448 /// values.
1449 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_epi32(__m128i __V)1450 _mm256_cvtepi16_epi32(__m128i __V)
1451 {
1452 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1453 }
1454
1455 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1456 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1457 /// elements of a 256-bit vector of [4 x i64].
1458 ///
1459 /// \code{.operation}
1460 /// result[63:0] := SignExtend(__V[15:0])
1461 /// result[127:64] := SignExtend(__V[31:16])
1462 /// result[191:128] := SignExtend(__V[47:32])
1463 /// result[255:192] := SignExtend(__V[64:48])
1464 /// \endcode
1465 ///
1466 /// \headerfile <immintrin.h>
1467 ///
1468 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1469 ///
1470 /// \param __V
1471 /// A 128-bit vector of [8 x i16] containing the source values.
1472 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1473 /// values.
1474 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_epi64(__m128i __V)1475 _mm256_cvtepi16_epi64(__m128i __V)
1476 {
1477 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1478 }
1479
1480 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1481 /// \a __V and returns the 64-bit values in the corresponding elements of a
1482 /// 256-bit vector of [4 x i64].
1483 ///
1484 /// \code{.operation}
1485 /// result[63:0] := SignExtend(__V[31:0])
1486 /// result[127:64] := SignExtend(__V[63:32])
1487 /// result[191:128] := SignExtend(__V[95:64])
1488 /// result[255:192] := SignExtend(__V[127:96])
1489 /// \endcode
1490 ///
1491 /// \headerfile <immintrin.h>
1492 ///
1493 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1494 ///
1495 /// \param __V
1496 /// A 128-bit vector of [4 x i32] containing the source values.
1497 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1498 /// values.
1499 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi32_epi64(__m128i __V)1500 _mm256_cvtepi32_epi64(__m128i __V)
1501 {
1502 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1503 }
1504
1505 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1506 /// the 16-bit values in the corresponding elements of a 256-bit vector
1507 /// of [16 x i16].
1508 ///
1509 /// \code{.operation}
1510 /// FOR i := 0 TO 15
1511 /// j := i*8
1512 /// k := i*16
1513 /// result[k+15:k] := ZeroExtend(__V[j+7:j])
1514 /// ENDFOR
1515 /// \endcode
1516 ///
1517 /// \headerfile <immintrin.h>
1518 ///
1519 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1520 ///
1521 /// \param __V
1522 /// A 128-bit integer vector containing the source bytes.
1523 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1524 /// values.
1525 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi16(__m128i __V)1526 _mm256_cvtepu8_epi16(__m128i __V)
1527 {
1528 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1529 }
1530
1531 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1532 /// \a __V and returns the 32-bit values in the corresponding elements of a
1533 /// 256-bit vector of [8 x i32].
1534 ///
1535 /// \code{.operation}
1536 /// FOR i := 0 TO 7
1537 /// j := i*8
1538 /// k := i*32
1539 /// result[k+31:k] := ZeroExtend(__V[j+7:j])
1540 /// ENDFOR
1541 /// \endcode
1542 ///
1543 /// \headerfile <immintrin.h>
1544 ///
1545 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1546 ///
1547 /// \param __V
1548 /// A 128-bit integer vector containing the source bytes.
1549 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1550 /// values.
1551 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi32(__m128i __V)1552 _mm256_cvtepu8_epi32(__m128i __V)
1553 {
1554 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1555 }
1556
1557 /// Zero-extends the first four bytes from the 128-bit integer vector in
1558 /// \a __V and returns the 64-bit values in the corresponding elements of a
1559 /// 256-bit vector of [4 x i64].
1560 ///
1561 /// \code{.operation}
1562 /// result[63:0] := ZeroExtend(__V[7:0])
1563 /// result[127:64] := ZeroExtend(__V[15:8])
1564 /// result[191:128] := ZeroExtend(__V[23:16])
1565 /// result[255:192] := ZeroExtend(__V[31:24])
1566 /// \endcode
1567 ///
1568 /// \headerfile <immintrin.h>
1569 ///
1570 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1571 ///
1572 /// \param __V
1573 /// A 128-bit integer vector containing the source bytes.
1574 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1575 /// values.
1576 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi64(__m128i __V)1577 _mm256_cvtepu8_epi64(__m128i __V)
1578 {
1579 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1580 }
1581
1582 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1583 /// \a __V and returns the 32-bit values in the corresponding elements of a
1584 /// 256-bit vector of [8 x i32].
1585 ///
1586 /// \code{.operation}
1587 /// FOR i := 0 TO 7
1588 /// j := i*16
1589 /// k := i*32
1590 /// result[k+31:k] := ZeroExtend(__V[j+15:j])
1591 /// ENDFOR
1592 /// \endcode
1593 ///
1594 /// \headerfile <immintrin.h>
1595 ///
1596 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1597 ///
1598 /// \param __V
1599 /// A 128-bit vector of [8 x i16] containing the source values.
1600 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1601 /// values.
1602 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_epi32(__m128i __V)1603 _mm256_cvtepu16_epi32(__m128i __V)
1604 {
1605 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1606 }
1607
1608 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1609 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1610 /// elements of a 256-bit vector of [4 x i64].
1611 ///
1612 /// \code{.operation}
1613 /// result[63:0] := ZeroExtend(__V[15:0])
1614 /// result[127:64] := ZeroExtend(__V[31:16])
1615 /// result[191:128] := ZeroExtend(__V[47:32])
/// result[255:192] := ZeroExtend(__V[63:48])
1617 /// \endcode
1618 ///
1619 /// \headerfile <immintrin.h>
1620 ///
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1622 ///
1623 /// \param __V
1624 /// A 128-bit vector of [8 x i16] containing the source values.
1625 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1626 /// values.
1627 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_epi64(__m128i __V)1628 _mm256_cvtepu16_epi64(__m128i __V)
1629 {
1630 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1631 }
1632
1633 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1634 /// \a __V and returns the 64-bit values in the corresponding elements of a
1635 /// 256-bit vector of [4 x i64].
1636 ///
1637 /// \code{.operation}
1638 /// result[63:0] := ZeroExtend(__V[31:0])
1639 /// result[127:64] := ZeroExtend(__V[63:32])
1640 /// result[191:128] := ZeroExtend(__V[95:64])
1641 /// result[255:192] := ZeroExtend(__V[127:96])
1642 /// \endcode
1643 ///
1644 /// \headerfile <immintrin.h>
1645 ///
1646 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1647 ///
1648 /// \param __V
1649 /// A 128-bit vector of [4 x i32] containing the source values.
1650 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1651 /// values.
1652 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu32_epi64(__m128i __V)1653 _mm256_cvtepu32_epi64(__m128i __V)
1654 {
1655 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1656 }
1657
1658 /// Multiplies signed 32-bit integers from even-numbered elements of two
1659 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1660 /// [4 x i64] result.
1661 ///
1662 /// \code{.operation}
1663 /// result[63:0] := __a[31:0] * __b[31:0]
1664 /// result[127:64] := __a[95:64] * __b[95:64]
1665 /// result[191:128] := __a[159:128] * __b[159:128]
1666 /// result[255:192] := __a[223:192] * __b[223:192]
1667 /// \endcode
1668 ///
1669 /// \headerfile <immintrin.h>
1670 ///
1671 /// This intrinsic corresponds to the \c VPMULDQ instruction.
1672 ///
1673 /// \param __a
1674 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1675 /// \param __b
1676 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1677 /// \returns A 256-bit vector of [4 x i64] containing the products.
1678 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mul_epi32(__m256i __a,__m256i __b)1679 _mm256_mul_epi32(__m256i __a, __m256i __b)
1680 {
1681 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1682 }
1683
1684 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1685 /// [16 x i16], truncates the 32-bit results to the most significant 18
1686 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1687 /// product in the [16 x i16] result.
1688 ///
1689 /// \code{.operation}
1690 /// FOR i := 0 TO 15
1691 /// j := i*16
1692 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1693 /// result[j+15:j] := temp[16:1]
/// ENDFOR
/// \endcode
1695 ///
1696 /// \headerfile <immintrin.h>
1697 ///
1698 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
1699 ///
1700 /// \param __a
1701 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1702 /// \param __b
1703 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1704 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1705 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mulhrs_epi16(__m256i __a,__m256i __b)1706 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1707 {
1708 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1709 }
1710
1711 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1712 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1713 /// [16 x i16] result.
1714 ///
1715 /// \headerfile <immintrin.h>
1716 ///
1717 /// This intrinsic corresponds to the \c VPMULHUW instruction.
1718 ///
1719 /// \param __a
1720 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1721 /// \param __b
1722 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1723 /// \returns A 256-bit vector of [16 x i16] containing the products.
1724 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mulhi_epu16(__m256i __a,__m256i __b)1725 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
1726 {
1727 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1728 }
1729
1730 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1731 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1732 /// [16 x i16] result.
1733 ///
1734 /// \headerfile <immintrin.h>
1735 ///
1736 /// This intrinsic corresponds to the \c VPMULHW instruction.
1737 ///
1738 /// \param __a
1739 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1740 /// \param __b
1741 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1742 /// \returns A 256-bit vector of [16 x i16] containing the products.
1743 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mulhi_epi16(__m256i __a,__m256i __b)1744 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
1745 {
1746 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1747 }
1748
1749 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1750 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1751 /// [16 x i16] result.
1752 ///
1753 /// \headerfile <immintrin.h>
1754 ///
1755 /// This intrinsic corresponds to the \c VPMULLW instruction.
1756 ///
1757 /// \param __a
1758 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1759 /// \param __b
1760 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1761 /// \returns A 256-bit vector of [16 x i16] containing the products.
1762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mullo_epi16(__m256i __a,__m256i __b)1763 _mm256_mullo_epi16(__m256i __a, __m256i __b)
1764 {
1765 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1766 }
1767
1768 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1769 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1770 /// [8 x i32] result.
1771 ///
1772 /// \headerfile <immintrin.h>
1773 ///
1774 /// This intrinsic corresponds to the \c VPMULLD instruction.
1775 ///
1776 /// \param __a
1777 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1778 /// \param __b
1779 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1780 /// \returns A 256-bit vector of [8 x i32] containing the products.
1781 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mullo_epi32(__m256i __a,__m256i __b)1782 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
1783 {
1784 return (__m256i)((__v8su)__a * (__v8su)__b);
1785 }
1786
/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1788 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1789 /// [4 x i64] result.
1790 ///
1791 /// \code{.operation}
1792 /// result[63:0] := __a[31:0] * __b[31:0]
1793 /// result[127:64] := __a[95:64] * __b[95:64]
1794 /// result[191:128] := __a[159:128] * __b[159:128]
1795 /// result[255:192] := __a[223:192] * __b[223:192]
1796 /// \endcode
1797 ///
1798 /// \headerfile <immintrin.h>
1799 ///
1800 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1801 ///
1802 /// \param __a
1803 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1804 /// \param __b
1805 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1806 /// \returns A 256-bit vector of [4 x i64] containing the products.
1807 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mul_epu32(__m256i __a,__m256i __b)1808 _mm256_mul_epu32(__m256i __a, __m256i __b)
1809 {
1810 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1811 }
1812
1813 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1814 /// \a __b.
1815 ///
1816 /// \headerfile <immintrin.h>
1817 ///
1818 /// This intrinsic corresponds to the \c VPOR instruction.
1819 ///
1820 /// \param __a
1821 /// A 256-bit integer vector.
1822 /// \param __b
1823 /// A 256-bit integer vector.
1824 /// \returns A 256-bit integer vector containing the result.
1825 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_or_si256(__m256i __a,__m256i __b)1826 _mm256_or_si256(__m256i __a, __m256i __b)
1827 {
1828 return (__m256i)((__v4du)__a | (__v4du)__b);
1829 }
1830
1831 /// Computes four sum of absolute difference (SAD) operations on sets of eight
1832 /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1833 /// \a __b.
1834 ///
1835 /// One SAD result is computed for each set of eight bytes from \a __a and
1836 /// eight bytes from \a __b. The zero-extended SAD value is returned in the
1837 /// corresponding 64-bit element of the result.
1838 ///
1839 /// A single SAD operation takes the differences between the corresponding
1840 /// bytes of \a __a and \a __b, takes the absolute value of each difference,
1841 /// and sums these eight values to form one 16-bit result. This operation
1842 /// is repeated four times with successive sets of eight bytes.
1843 ///
1844 /// \code{.operation}
1845 /// FOR i := 0 TO 3
1846 /// j := i*64
1847 /// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1848 /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1849 /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1850 /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1851 /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1852 /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1853 /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1854 /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1855 /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1856 /// temp4 + temp5 + temp6 + temp7
1857 /// result[j+63:j+16] := 0
1858 /// ENDFOR
1859 /// \endcode
1860 ///
1861 /// \headerfile <immintrin.h>
1862 ///
1863 /// This intrinsic corresponds to the \c VPSADBW instruction.
1864 ///
1865 /// \param __a
1866 /// A 256-bit integer vector.
1867 /// \param __b
1868 /// A 256-bit integer vector.
1869 /// \returns A 256-bit integer vector containing the result.
1870 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sad_epu8(__m256i __a,__m256i __b)1871 _mm256_sad_epu8(__m256i __a, __m256i __b)
1872 {
1873 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1874 }
1875
1876 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1877 /// to control information in the 256-bit integer vector \a __b, and
1878 /// returns the 256-bit result. In effect there are two separate 128-bit
1879 /// shuffles in the lower and upper halves.
1880 ///
1881 /// \code{.operation}
1882 /// FOR i := 0 TO 31
1883 /// j := i*8
1884 /// IF __b[j+7] == 1
1885 /// result[j+7:j] := 0
1886 /// ELSE
1887 /// k := __b[j+3:j] * 8
1888 /// IF i > 15
1889 /// k := k + 128
1890 /// FI
1891 /// result[j+7:j] := __a[k+7:k]
1892 /// FI
1893 /// ENDFOR
1894 /// \endcode
1895 ///
1896 /// \headerfile <immintrin.h>
1897 ///
1898 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1899 ///
1900 /// \param __a
1901 /// A 256-bit integer vector containing source values.
1902 /// \param __b
1903 /// A 256-bit integer vector containing control information to determine
1904 /// what goes into the corresponding byte of the result. If bit 7 of the
1905 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1906 /// control byte specify the index (within the same 128-bit half) of \a __a
1907 /// to copy to the result byte.
1908 /// \returns A 256-bit integer vector containing the result.
1909 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shuffle_epi8(__m256i __a,__m256i __b)1910 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
1911 {
1912 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1913 }
1914
1915 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1916 /// according to control information in the integer literal \a imm, and
1917 /// returns the 256-bit result. In effect there are two parallel 128-bit
1918 /// shuffles in the lower and upper halves.
1919 ///
1920 /// \code{.operation}
1921 /// FOR i := 0 to 3
1922 /// j := i*32
1923 /// k := (imm >> i*2)[1:0] * 32
1924 /// result[j+31:j] := a[k+31:k]
1925 /// result[128+j+31:128+j] := a[128+k+31:128+k]
1926 /// ENDFOR
1927 /// \endcode
1928 ///
1929 /// \headerfile <immintrin.h>
1930 ///
1931 /// \code
1932 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1933 /// \endcode
1934 ///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
1936 ///
1937 /// \param a
1938 /// A 256-bit vector of [8 x i32] containing source values.
1939 /// \param imm
1940 /// An immediate 8-bit value specifying which elements to copy from \a a.
1941 /// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1942 /// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1943 /// forth.
1944 /// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm)                                           \
  ((__m256i)__builtin_ia32_pshufd256(                                          \
      (__v8si)(__m256i)(a),                                                    \
      (int)(imm)))
1947
1948 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1949 /// according to control information in the integer literal \a imm, and
1950 /// returns the 256-bit result. The upper 64 bits of each 128-bit half
1951 /// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1952 /// copied from \a a unchanged.
1953 ///
1954 /// \code{.operation}
1955 /// result[63:0] := a[63:0]
1956 /// result[191:128] := a[191:128]
1957 /// FOR i := 0 TO 3
1958 /// j := i * 16 + 64
1959 /// k := (imm >> i*2)[1:0] * 16 + 64
1960 /// result[j+15:j] := a[k+15:k]
1961 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1962 /// ENDFOR
1963 /// \endcode
1964 ///
1965 /// \headerfile <immintrin.h>
1966 ///
1967 /// \code
1968 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1969 /// \endcode
1970 ///
1971 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1972 ///
1973 /// \param a
1974 /// A 256-bit vector of [16 x i16] containing source values.
1975 /// \param imm
1976 /// An immediate 8-bit value specifying which elements to copy from \a a.
/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1979 /// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1980 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshufhw256(                                         \
      (__v16hi)(__m256i)(a),                                                   \
      (int)(imm)))
1983
1984 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1985 /// according to control information in the integer literal \a imm, and
1986 /// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1987 /// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1988 /// copied from \a a unchanged.
1989 ///
1990 /// \code{.operation}
1991 /// result[127:64] := a[127:64]
1992 /// result[255:192] := a[255:192]
1993 /// FOR i := 0 TO 3
1994 /// j := i * 16
1995 /// k := (imm >> i*2)[1:0] * 16
1996 /// result[j+15:j] := a[k+15:k]
1997 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1998 /// ENDFOR
1999 /// \endcode
2000 ///
2001 /// \headerfile <immintrin.h>
2002 ///
2003 /// \code
2004 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
2005 /// \endcode
2006 ///
2007 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
2008 ///
2009 /// \param a
2010 /// A 256-bit vector of [16 x i16] to use as a source of data for the
2011 /// result.
2012 /// \param imm
2013 /// An immediate 8-bit value specifying which elements to copy from \a a.
2014 /// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2015 /// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2016 /// forth.
2017 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256(                                         \
      (__v16hi)(__m256i)(a),                                                   \
      (int)(imm)))
2020
2021 /// Sets each byte of the result to the corresponding byte of the 256-bit
2022 /// integer vector in \a __a, the negative of that byte, or zero, depending
2023 /// on whether the corresponding byte of the 256-bit integer vector in
2024 /// \a __b is greater than zero, less than zero, or equal to zero,
2025 /// respectively.
2026 ///
2027 /// \headerfile <immintrin.h>
2028 ///
2029 /// This intrinsic corresponds to the \c VPSIGNB instruction.
2030 ///
2031 /// \param __a
2032 /// A 256-bit integer vector.
2033 /// \param __b
/// A 256-bit integer vector.
2035 /// \returns A 256-bit integer vector containing the result.
2036 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi8(__m256i __a,__m256i __b)2037 _mm256_sign_epi8(__m256i __a, __m256i __b)
2038 {
2039 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2040 }
2041
2042 /// Sets each element of the result to the corresponding element of the
2043 /// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2044 /// or zero, depending on whether the corresponding element of the 256-bit
2045 /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2046 /// equal to zero, respectively.
2047 ///
2048 /// \headerfile <immintrin.h>
2049 ///
2050 /// This intrinsic corresponds to the \c VPSIGNW instruction.
2051 ///
2052 /// \param __a
2053 /// A 256-bit vector of [16 x i16].
2054 /// \param __b
2055 /// A 256-bit vector of [16 x i16].
2056 /// \returns A 256-bit vector of [16 x i16] containing the result.
2057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi16(__m256i __a,__m256i __b)2058 _mm256_sign_epi16(__m256i __a, __m256i __b)
2059 {
2060 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2061 }
2062
2063 /// Sets each element of the result to the corresponding element of the
2064 /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2065 /// zero, depending on whether the corresponding element of the 256-bit
2066 /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2067 /// equal to zero, respectively.
2068 ///
2069 /// \headerfile <immintrin.h>
2070 ///
2071 /// This intrinsic corresponds to the \c VPSIGND instruction.
2072 ///
2073 /// \param __a
2074 /// A 256-bit vector of [8 x i32].
2075 /// \param __b
2076 /// A 256-bit vector of [8 x i32].
2077 /// \returns A 256-bit vector of [8 x i32] containing the result.
2078 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi32(__m256i __a,__m256i __b)2079 _mm256_sign_epi32(__m256i __a, __m256i __b)
2080 {
2081 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2082 }
2083
2084 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2085 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2086 /// is greater than 15, the returned result is all zeroes.
2087 ///
2088 /// \headerfile <immintrin.h>
2089 ///
2090 /// \code
2091 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
2092 /// \endcode
2093 ///
2094 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2095 ///
2096 /// \param a
2097 /// A 256-bit integer vector to be shifted.
2098 /// \param imm
2099 /// An unsigned immediate value specifying the shift count (in bytes).
2100 /// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                               \
      (__v4di)(__m256i)(a),                                                    \
      (int)(imm)))
2103
2104 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2105 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2106 /// is greater than 15, the returned result is all zeroes.
2107 ///
2108 /// \headerfile <immintrin.h>
2109 ///
2110 /// \code
2111 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2112 /// \endcode
2113 ///
2114 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2115 ///
2116 /// \param a
2117 /// A 256-bit integer vector to be shifted.
2118 /// \param imm
2119 /// An unsigned immediate value specifying the shift count (in bytes).
2120 /// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                               \
      (__v4di)(__m256i)(a),                                                    \
      (int)(imm)))
2123
2124 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2125 /// left by \a __count bits, shifting in zero bits, and returns the result.
2126 /// If \a __count is greater than 15, the returned result is all zeroes.
2127 ///
2128 /// \headerfile <immintrin.h>
2129 ///
2130 /// This intrinsic corresponds to the \c VPSLLW instruction.
2131 ///
2132 /// \param __a
2133 /// A 256-bit vector of [16 x i16] to be shifted.
2134 /// \param __count
2135 /// An unsigned integer value specifying the shift count (in bits).
2136 /// \returns A 256-bit vector of [16 x i16] containing the result.
2137 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi16(__m256i __a,int __count)2138 _mm256_slli_epi16(__m256i __a, int __count)
2139 {
2140 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2141 }
2142
2143 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2144 /// left by the number of bits specified by the lower 64 bits of \a __count,
2145 /// shifting in zero bits, and returns the result. If \a __count is greater
2146 /// than 15, the returned result is all zeroes.
2147 ///
2148 /// \headerfile <immintrin.h>
2149 ///
2150 /// This intrinsic corresponds to the \c VPSLLW instruction.
2151 ///
2152 /// \param __a
2153 /// A 256-bit vector of [16 x i16] to be shifted.
2154 /// \param __count
2155 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2156 /// shift count (in bits). The upper element is ignored.
2157 /// \returns A 256-bit vector of [16 x i16] containing the result.
2158 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sll_epi16(__m256i __a,__m128i __count)2159 _mm256_sll_epi16(__m256i __a, __m128i __count)
2160 {
2161 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2162 }
2163
2164 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2165 /// left by \a __count bits, shifting in zero bits, and returns the result.
2166 /// If \a __count is greater than 31, the returned result is all zeroes.
2167 ///
2168 /// \headerfile <immintrin.h>
2169 ///
2170 /// This intrinsic corresponds to the \c VPSLLD instruction.
2171 ///
2172 /// \param __a
2173 /// A 256-bit vector of [8 x i32] to be shifted.
2174 /// \param __count
2175 /// An unsigned integer value specifying the shift count (in bits).
2176 /// \returns A 256-bit vector of [8 x i32] containing the result.
2177 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi32(__m256i __a,int __count)2178 _mm256_slli_epi32(__m256i __a, int __count)
2179 {
2180 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2181 }
2182
2183 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2184 /// left by the number of bits given in the lower 64 bits of \a __count,
2185 /// shifting in zero bits, and returns the result. If \a __count is greater
2186 /// than 31, the returned result is all zeroes.
2187 ///
2188 /// \headerfile <immintrin.h>
2189 ///
2190 /// This intrinsic corresponds to the \c VPSLLD instruction.
2191 ///
2192 /// \param __a
2193 /// A 256-bit vector of [8 x i32] to be shifted.
2194 /// \param __count
2195 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2196 /// shift count (in bits). The upper element is ignored.
2197 /// \returns A 256-bit vector of [8 x i32] containing the result.
2198 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sll_epi32(__m256i __a,__m128i __count)2199 _mm256_sll_epi32(__m256i __a, __m128i __count)
2200 {
2201 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2202 }
2203
2204 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2205 /// left by \a __count bits, shifting in zero bits, and returns the result.
2206 /// If \a __count is greater than 63, the returned result is all zeroes.
2207 ///
2208 /// \headerfile <immintrin.h>
2209 ///
2210 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2211 ///
2212 /// \param __a
2213 /// A 256-bit vector of [4 x i64] to be shifted.
2214 /// \param __count
2215 /// An unsigned integer value specifying the shift count (in bits).
2216 /// \returns A 256-bit vector of [4 x i64] containing the result.
2217 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi64(__m256i __a,int __count)2218 _mm256_slli_epi64(__m256i __a, int __count)
2219 {
2220 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2221 }
2222
2223 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2224 /// left by the number of bits given in the lower 64 bits of \a __count,
2225 /// shifting in zero bits, and returns the result. If \a __count is greater
2226 /// than 63, the returned result is all zeroes.
2227 ///
2228 /// \headerfile <immintrin.h>
2229 ///
2230 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2231 ///
2232 /// \param __a
2233 /// A 256-bit vector of [4 x i64] to be shifted.
2234 /// \param __count
2235 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2236 /// shift count (in bits). The upper element is ignored.
2237 /// \returns A 256-bit vector of [4 x i64] containing the result.
2238 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sll_epi64(__m256i __a,__m128i __count)2239 _mm256_sll_epi64(__m256i __a, __m128i __count)
2240 {
2241 return __builtin_ia32_psllq256((__v4di)__a, __count);
2242 }
2243
2244 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2245 /// right by \a __count bits, shifting in sign bits, and returns the result.
2246 /// If \a __count is greater than 15, each element of the result is either
2247 /// 0 or -1 according to the corresponding input sign bit.
2248 ///
2249 /// \headerfile <immintrin.h>
2250 ///
2251 /// This intrinsic corresponds to the \c VPSRAW instruction.
2252 ///
2253 /// \param __a
2254 /// A 256-bit vector of [16 x i16] to be shifted.
2255 /// \param __count
2256 /// An unsigned integer value specifying the shift count (in bits).
2257 /// \returns A 256-bit vector of [16 x i16] containing the result.
2258 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srai_epi16(__m256i __a,int __count)2259 _mm256_srai_epi16(__m256i __a, int __count)
2260 {
2261 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2262 }
2263
2264 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2265 /// right by the number of bits given in the lower 64 bits of \a __count,
2266 /// shifting in sign bits, and returns the result. If \a __count is greater
2267 /// than 15, each element of the result is either 0 or -1 according to the
2268 /// corresponding input sign bit.
2269 ///
2270 /// \headerfile <immintrin.h>
2271 ///
2272 /// This intrinsic corresponds to the \c VPSRAW instruction.
2273 ///
2274 /// \param __a
2275 /// A 256-bit vector of [16 x i16] to be shifted.
2276 /// \param __count
2277 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2278 /// shift count (in bits). The upper element is ignored.
2279 /// \returns A 256-bit vector of [16 x i16] containing the result.
2280 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sra_epi16(__m256i __a,__m128i __count)2281 _mm256_sra_epi16(__m256i __a, __m128i __count)
2282 {
2283 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2284 }
2285
2286 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2287 /// right by \a __count bits, shifting in sign bits, and returns the result.
2288 /// If \a __count is greater than 31, each element of the result is either
2289 /// 0 or -1 according to the corresponding input sign bit.
2290 ///
2291 /// \headerfile <immintrin.h>
2292 ///
2293 /// This intrinsic corresponds to the \c VPSRAD instruction.
2294 ///
2295 /// \param __a
2296 /// A 256-bit vector of [8 x i32] to be shifted.
2297 /// \param __count
2298 /// An unsigned integer value specifying the shift count (in bits).
2299 /// \returns A 256-bit vector of [8 x i32] containing the result.
2300 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srai_epi32(__m256i __a,int __count)2301 _mm256_srai_epi32(__m256i __a, int __count)
2302 {
2303 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2304 }
2305
2306 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2307 /// right by the number of bits given in the lower 64 bits of \a __count,
2308 /// shifting in sign bits, and returns the result. If \a __count is greater
2309 /// than 31, each element of the result is either 0 or -1 according to the
2310 /// corresponding input sign bit.
2311 ///
2312 /// \headerfile <immintrin.h>
2313 ///
2314 /// This intrinsic corresponds to the \c VPSRAD instruction.
2315 ///
2316 /// \param __a
2317 /// A 256-bit vector of [8 x i32] to be shifted.
2318 /// \param __count
2319 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2320 /// shift count (in bits). The upper element is ignored.
2321 /// \returns A 256-bit vector of [8 x i32] containing the result.
2322 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sra_epi32(__m256i __a,__m128i __count)2323 _mm256_sra_epi32(__m256i __a, __m128i __count)
2324 {
2325 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2326 }
2327
2328 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2329 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2330 /// \a imm is greater than 15, the returned result is all zeroes.
2331 ///
2332 /// \headerfile <immintrin.h>
2333 ///
2334 /// \code
2335 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
2336 /// \endcode
2337 ///
2338 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2339 ///
2340 /// \param a
2341 /// A 256-bit integer vector to be shifted.
2342 /// \param imm
2343 /// An unsigned immediate value specifying the shift count (in bytes).
2344 /// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2347
/// Right-shifts each 128-bit half of the 256-bit integer vector \a a by
///    \a imm bytes. Zero bytes are shifted in. When \a imm is greater than
///    15, every byte of the result is zero.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    The 256-bit integer vector whose halves are shifted.
/// \param imm
///    An unsigned immediate value giving the shift count in bytes.
/// \returns The shifted 256-bit integer vector.
#define _mm256_bsrli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                               \
      (__m256i)(a), (int)(imm)))
2367
2368 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2369 /// right by \a __count bits, shifting in zero bits, and returns the result.
2370 /// If \a __count is greater than 15, the returned result is all zeroes.
2371 ///
2372 /// \headerfile <immintrin.h>
2373 ///
2374 /// This intrinsic corresponds to the \c VPSRLW instruction.
2375 ///
2376 /// \param __a
2377 /// A 256-bit vector of [16 x i16] to be shifted.
2378 /// \param __count
2379 /// An unsigned integer value specifying the shift count (in bits).
2380 /// \returns A 256-bit vector of [16 x i16] containing the result.
2381 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi16(__m256i __a,int __count)2382 _mm256_srli_epi16(__m256i __a, int __count)
2383 {
2384 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2385 }
2386
2387 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2388 /// right by the number of bits given in the lower 64 bits of \a __count,
2389 /// shifting in zero bits, and returns the result. If \a __count is greater
2390 /// than 15, the returned result is all zeroes.
2391 ///
2392 /// \headerfile <immintrin.h>
2393 ///
2394 /// This intrinsic corresponds to the \c VPSRLW instruction.
2395 ///
2396 /// \param __a
2397 /// A 256-bit vector of [16 x i16] to be shifted.
2398 /// \param __count
2399 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2400 /// shift count (in bits). The upper element is ignored.
2401 /// \returns A 256-bit vector of [16 x i16] containing the result.
2402 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srl_epi16(__m256i __a,__m128i __count)2403 _mm256_srl_epi16(__m256i __a, __m128i __count)
2404 {
2405 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2406 }
2407
2408 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2409 /// right by \a __count bits, shifting in zero bits, and returns the result.
2410 /// If \a __count is greater than 31, the returned result is all zeroes.
2411 ///
2412 /// \headerfile <immintrin.h>
2413 ///
2414 /// This intrinsic corresponds to the \c VPSRLD instruction.
2415 ///
2416 /// \param __a
2417 /// A 256-bit vector of [8 x i32] to be shifted.
2418 /// \param __count
2419 /// An unsigned integer value specifying the shift count (in bits).
2420 /// \returns A 256-bit vector of [8 x i32] containing the result.
2421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi32(__m256i __a,int __count)2422 _mm256_srli_epi32(__m256i __a, int __count)
2423 {
2424 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2425 }
2426
2427 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2428 /// right by the number of bits given in the lower 64 bits of \a __count,
2429 /// shifting in zero bits, and returns the result. If \a __count is greater
2430 /// than 31, the returned result is all zeroes.
2431 ///
2432 /// \headerfile <immintrin.h>
2433 ///
2434 /// This intrinsic corresponds to the \c VPSRLD instruction.
2435 ///
2436 /// \param __a
2437 /// A 256-bit vector of [8 x i32] to be shifted.
2438 /// \param __count
2439 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2440 /// shift count (in bits). The upper element is ignored.
2441 /// \returns A 256-bit vector of [8 x i32] containing the result.
2442 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srl_epi32(__m256i __a,__m128i __count)2443 _mm256_srl_epi32(__m256i __a, __m128i __count)
2444 {
2445 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2446 }
2447
2448 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2449 /// right by \a __count bits, shifting in zero bits, and returns the result.
2450 /// If \a __count is greater than 63, the returned result is all zeroes.
2451 ///
2452 /// \headerfile <immintrin.h>
2453 ///
2454 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2455 ///
2456 /// \param __a
2457 /// A 256-bit vector of [4 x i64] to be shifted.
2458 /// \param __count
2459 /// An unsigned integer value specifying the shift count (in bits).
2460 /// \returns A 256-bit vector of [4 x i64] containing the result.
2461 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi64(__m256i __a,int __count)2462 _mm256_srli_epi64(__m256i __a, int __count)
2463 {
2464 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2465 }
2466
2467 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2468 /// right by the number of bits given in the lower 64 bits of \a __count,
2469 /// shifting in zero bits, and returns the result. If \a __count is greater
2470 /// than 63, the returned result is all zeroes.
2471 ///
2472 /// \headerfile <immintrin.h>
2473 ///
2474 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2475 ///
2476 /// \param __a
2477 /// A 256-bit vector of [4 x i64] to be shifted.
2478 /// \param __count
2479 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2480 /// shift count (in bits). The upper element is ignored.
2481 /// \returns A 256-bit vector of [4 x i64] containing the result.
2482 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srl_epi64(__m256i __a,__m128i __count)2483 _mm256_srl_epi64(__m256i __a, __m128i __count)
2484 {
2485 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2486 }
2487
2488 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2489 /// vectors. Returns the lower 8 bits of each difference in the
2490 /// corresponding byte of the 256-bit integer vector result (overflow is
2491 /// ignored).
2492 ///
2493 /// \code{.operation}
2494 /// FOR i := 0 TO 31
2495 /// j := i*8
2496 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2497 /// ENDFOR
2498 /// \endcode
2499 ///
2500 /// \headerfile <immintrin.h>
2501 ///
2502 /// This intrinsic corresponds to the \c VPSUBB instruction.
2503 ///
2504 /// \param __a
2505 /// A 256-bit integer vector containing the minuends.
2506 /// \param __b
2507 /// A 256-bit integer vector containing the subtrahends.
2508 /// \returns A 256-bit integer vector containing the differences.
2509 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sub_epi8(__m256i __a,__m256i __b)2510 _mm256_sub_epi8(__m256i __a, __m256i __b)
2511 {
2512 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2513 }
2514
2515 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2516 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2517 /// the corresponding element of the [16 x i16] result (overflow is
2518 /// ignored).
2519 ///
2520 /// \code{.operation}
2521 /// FOR i := 0 TO 15
2522 /// j := i*16
2523 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2524 /// ENDFOR
2525 /// \endcode
2526 ///
2527 /// \headerfile <immintrin.h>
2528 ///
2529 /// This intrinsic corresponds to the \c VPSUBW instruction.
2530 ///
2531 /// \param __a
2532 /// A 256-bit vector of [16 x i16] containing the minuends.
2533 /// \param __b
2534 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2535 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2536 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sub_epi16(__m256i __a,__m256i __b)2537 _mm256_sub_epi16(__m256i __a, __m256i __b)
2538 {
2539 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2540 }
2541
2542 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
2543 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2544 /// the corresponding element of the [8 x i32] result (overflow is ignored).
2545 ///
2546 /// \code{.operation}
2547 /// FOR i := 0 TO 7
2548 /// j := i*32
2549 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2550 /// ENDFOR
2551 /// \endcode
2552 ///
2553 /// \headerfile <immintrin.h>
2554 ///
2555 /// This intrinsic corresponds to the \c VPSUBD instruction.
2556 ///
2557 /// \param __a
2558 /// A 256-bit vector of [8 x i32] containing the minuends.
2559 /// \param __b
2560 /// A 256-bit vector of [8 x i32] containing the subtrahends.
2561 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2562 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sub_epi32(__m256i __a,__m256i __b)2563 _mm256_sub_epi32(__m256i __a, __m256i __b)
2564 {
2565 return (__m256i)((__v8su)__a - (__v8su)__b);
2566 }
2567
2568 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
2569 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2570 /// the corresponding element of the [4 x i64] result (overflow is ignored).
2571 ///
2572 /// \code{.operation}
2573 /// FOR i := 0 TO 3
2574 /// j := i*64
2575 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2576 /// ENDFOR
2577 /// \endcode
2578 ///
2579 /// \headerfile <immintrin.h>
2580 ///
2581 /// This intrinsic corresponds to the \c VPSUBQ instruction.
2582 ///
2583 /// \param __a
2584 /// A 256-bit vector of [4 x i64] containing the minuends.
2585 /// \param __b
2586 /// A 256-bit vector of [4 x i64] containing the subtrahends.
2587 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2588 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sub_epi64(__m256i __a,__m256i __b)2589 _mm256_sub_epi64(__m256i __a, __m256i __b)
2590 {
2591 return (__m256i)((__v4du)__a - (__v4du)__b);
2592 }
2593
2594 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2595 /// vectors using signed saturation, and returns each differences in the
2596 /// corresponding byte of the 256-bit integer vector result.
2597 ///
2598 /// \code{.operation}
2599 /// FOR i := 0 TO 31
2600 /// j := i*8
2601 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2602 /// ENDFOR
2603 /// \endcode
2604 ///
2605 /// \headerfile <immintrin.h>
2606 ///
2607 /// This intrinsic corresponds to the \c VPSUBSB instruction.
2608 ///
2609 /// \param __a
2610 /// A 256-bit integer vector containing the minuends.
2611 /// \param __b
2612 /// A 256-bit integer vector containing the subtrahends.
2613 /// \returns A 256-bit integer vector containing the differences.
2614 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi8(__m256i __a,__m256i __b)2615 _mm256_subs_epi8(__m256i __a, __m256i __b)
2616 {
2617 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2618 }
2619
2620 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2621 /// vectors of [16 x i16] using signed saturation, and returns each
2622 /// difference in the corresponding element of the [16 x i16] result.
2623 ///
2624 /// \code{.operation}
2625 /// FOR i := 0 TO 15
2626 /// j := i*16
2627 /// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
2628 /// ENDFOR
2629 /// \endcode
2630 ///
2631 /// \headerfile <immintrin.h>
2632 ///
2633 /// This intrinsic corresponds to the \c VPSUBSW instruction.
2634 ///
2635 /// \param __a
2636 /// A 256-bit vector of [16 x i16] containing the minuends.
2637 /// \param __b
2638 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2639 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2640 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi16(__m256i __a,__m256i __b)2641 _mm256_subs_epi16(__m256i __a, __m256i __b)
2642 {
2643 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2644 }
2645
2646 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2647 /// vectors using unsigned saturation, and returns each difference in the
2648 /// corresponding byte of the 256-bit integer vector result. For each byte,
2649 /// computes <c> result = __a - __b </c>.
2650 ///
2651 /// \code{.operation}
2652 /// FOR i := 0 TO 31
2653 /// j := i*8
2654 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2655 /// ENDFOR
2656 /// \endcode
2657 ///
2658 /// \headerfile <immintrin.h>
2659 ///
2660 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
2661 ///
2662 /// \param __a
2663 /// A 256-bit integer vector containing the minuends.
2664 /// \param __b
2665 /// A 256-bit integer vector containing the subtrahends.
2666 /// \returns A 256-bit integer vector containing the differences.
2667 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu8(__m256i __a,__m256i __b)2668 _mm256_subs_epu8(__m256i __a, __m256i __b)
2669 {
2670 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2671 }
2672
2673 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2674 /// vectors of [16 x i16] using unsigned saturation, and returns each
2675 /// difference in the corresponding element of the [16 x i16] result.
2676 ///
2677 /// \code{.operation}
2678 /// FOR i := 0 TO 15
2679 /// j := i*16
2680 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2681 /// ENDFOR
2682 /// \endcode
2683 ///
2684 /// \headerfile <immintrin.h>
2685 ///
2686 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
2687 ///
2688 /// \param __a
2689 /// A 256-bit vector of [16 x i16] containing the minuends.
2690 /// \param __b
2691 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2692 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2693 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu16(__m256i __a,__m256i __b)2694 _mm256_subs_epu16(__m256i __a, __m256i __b)
2695 {
2696 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2697 }
2698
2699 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2700 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2701 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2702 /// input; other bits in these parameters are ignored.
2703 ///
2704 /// \code{.operation}
2705 /// result[7:0] := __a[71:64]
2706 /// result[15:8] := __b[71:64]
2707 /// result[23:16] := __a[79:72]
2708 /// result[31:24] := __b[79:72]
2709 /// . . .
2710 /// result[127:120] := __b[127:120]
2711 /// result[135:128] := __a[199:192]
2712 /// . . .
2713 /// result[255:248] := __b[255:248]
2714 /// \endcode
2715 ///
2716 /// \headerfile <immintrin.h>
2717 ///
2718 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2719 ///
2720 /// \param __a
2721 /// A 256-bit integer vector used as the source for the even-numbered bytes
2722 /// of the result.
2723 /// \param __b
2724 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2725 /// of the result.
2726 /// \returns A 256-bit integer vector containing the result.
2727 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpackhi_epi8(__m256i __a,__m256i __b)2728 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2729 {
2730 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2731 }
2732
2733 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2734 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2735 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2736 /// 128-bit half of \a __a and \a __b as input; other bits in these
2737 /// parameters are ignored.
2738 ///
2739 /// \code{.operation}
2740 /// result[15:0] := __a[79:64]
2741 /// result[31:16] := __b[79:64]
2742 /// result[47:32] := __a[95:80]
2743 /// result[63:48] := __b[95:80]
2744 /// . . .
2745 /// result[127:112] := __b[127:112]
2746 /// result[143:128] := __a[211:196]
2747 /// . . .
2748 /// result[255:240] := __b[255:240]
2749 /// \endcode
2750 ///
2751 /// \headerfile <immintrin.h>
2752 ///
2753 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2754 ///
2755 /// \param __a
2756 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2757 /// elements of the result.
2758 /// \param __b
2759 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2760 /// elements of the result.
2761 /// \returns A 256-bit vector of [16 x i16] containing the result.
2762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpackhi_epi16(__m256i __a,__m256i __b)2763 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2764 {
2765 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2766 }
2767
2768 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2769 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2770 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2771 /// of \a __a and \a __b as input; other bits in these parameters are
2772 /// ignored.
2773 ///
2774 /// \code{.operation}
2775 /// result[31:0] := __a[95:64]
2776 /// result[63:32] := __b[95:64]
2777 /// result[95:64] := __a[127:96]
2778 /// result[127:96] := __b[127:96]
2779 /// result[159:128] := __a[223:192]
2780 /// result[191:160] := __b[223:192]
2781 /// result[223:192] := __a[255:224]
2782 /// result[255:224] := __b[255:224]
2783 /// \endcode
2784 ///
2785 /// \headerfile <immintrin.h>
2786 ///
2787 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2788 ///
2789 /// \param __a
2790 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2791 /// elements of the result.
2792 /// \param __b
2793 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2794 /// elements of the result.
2795 /// \returns A 256-bit vector of [8 x i32] containing the result.
2796 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpackhi_epi32(__m256i __a,__m256i __b)2797 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2798 {
2799 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2800 }
2801
2802 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2803 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2804 /// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2805 /// of \a __a and \a __b as input; other bits in these parameters are
2806 /// ignored.
2807 ///
2808 /// \code{.operation}
2809 /// result[63:0] := __a[127:64]
2810 /// result[127:64] := __b[127:64]
2811 /// result[191:128] := __a[255:192]
2812 /// result[255:192] := __b[255:192]
2813 /// \endcode
2814 ///
2815 /// \headerfile <immintrin.h>
2816 ///
2817 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2818 ///
2819 /// \param __a
2820 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2821 /// elements of the result.
2822 /// \param __b
2823 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2824 /// elements of the result.
2825 /// \returns A 256-bit vector of [4 x i64] containing the result.
2826 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpackhi_epi64(__m256i __a,__m256i __b)2827 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2828 {
2829 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2830 }
2831
2832 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2833 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2834 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2835 /// input; other bits in these parameters are ignored.
2836 ///
2837 /// \code{.operation}
2838 /// result[7:0] := __a[7:0]
2839 /// result[15:8] := __b[7:0]
2840 /// result[23:16] := __a[15:8]
2841 /// result[31:24] := __b[15:8]
2842 /// . . .
2843 /// result[127:120] := __b[63:56]
2844 /// result[135:128] := __a[135:128]
2845 /// . . .
2846 /// result[255:248] := __b[191:184]
2847 /// \endcode
2848 ///
2849 /// \headerfile <immintrin.h>
2850 ///
2851 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2852 ///
2853 /// \param __a
2854 /// A 256-bit integer vector used as the source for the even-numbered bytes
2855 /// of the result.
2856 /// \param __b
2857 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2858 /// of the result.
2859 /// \returns A 256-bit integer vector containing the result.
2860 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpacklo_epi8(__m256i __a,__m256i __b)2861 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2862 {
2863 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2864 }
2865
2866 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2867 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2868 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2869 /// 128-bit half of \a __a and \a __b as input; other bits in these
2870 /// parameters are ignored.
2871 ///
2872 /// \code{.operation}
2873 /// result[15:0] := __a[15:0]
2874 /// result[31:16] := __b[15:0]
2875 /// result[47:32] := __a[31:16]
2876 /// result[63:48] := __b[31:16]
2877 /// . . .
2878 /// result[127:112] := __b[63:48]
2879 /// result[143:128] := __a[143:128]
2880 /// . . .
2881 /// result[255:239] := __b[191:176]
2882 /// \endcode
2883 ///
2884 /// \headerfile <immintrin.h>
2885 ///
2886 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2887 ///
2888 /// \param __a
2889 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2890 /// elements of the result.
2891 /// \param __b
2892 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2893 /// elements of the result.
2894 /// \returns A 256-bit vector of [16 x i16] containing the result.
2895 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpacklo_epi16(__m256i __a,__m256i __b)2896 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2897 {
2898 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2899 }
2900
2901 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2902 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2903 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2904 /// of \a __a and \a __b as input; other bits in these parameters are
2905 /// ignored.
2906 ///
2907 /// \code{.operation}
2908 /// result[31:0] := __a[31:0]
2909 /// result[63:32] := __b[31:0]
2910 /// result[95:64] := __a[63:32]
2911 /// result[127:96] := __b[63:32]
2912 /// result[159:128] := __a[159:128]
2913 /// result[191:160] := __b[159:128]
2914 /// result[223:192] := __a[191:160]
2915 /// result[255:224] := __b[191:190]
2916 /// \endcode
2917 ///
2918 /// \headerfile <immintrin.h>
2919 ///
2920 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2921 ///
2922 /// \param __a
2923 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2924 /// elements of the result.
2925 /// \param __b
2926 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2927 /// elements of the result.
2928 /// \returns A 256-bit vector of [8 x i32] containing the result.
2929 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpacklo_epi32(__m256i __a,__m256i __b)2930 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2931 {
2932 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2933 }
2934
2935 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2936 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2937 /// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2938 /// of \a __a and \a __b as input; other bits in these parameters are
2939 /// ignored.
2940 ///
2941 /// \code{.operation}
2942 /// result[63:0] := __a[63:0]
2943 /// result[127:64] := __b[63:0]
2944 /// result[191:128] := __a[191:128]
2945 /// result[255:192] := __b[191:128]
2946 /// \endcode
2947 ///
2948 /// \headerfile <immintrin.h>
2949 ///
2950 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2951 ///
2952 /// \param __a
2953 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2954 /// elements of the result.
2955 /// \param __b
2956 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2957 /// elements of the result.
2958 /// \returns A 256-bit vector of [4 x i64] containing the result.
2959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpacklo_epi64(__m256i __a,__m256i __b)2960 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2961 {
2962 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2963 }
2964
2965 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2966 /// \a __b.
2967 ///
2968 /// \headerfile <immintrin.h>
2969 ///
2970 /// This intrinsic corresponds to the \c VPXOR instruction.
2971 ///
2972 /// \param __a
2973 /// A 256-bit integer vector.
2974 /// \param __b
2975 /// A 256-bit integer vector.
2976 /// \returns A 256-bit integer vector containing the result.
2977 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_xor_si256(__m256i __a,__m256i __b)2978 _mm256_xor_si256(__m256i __a, __m256i __b)
2979 {
2980 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2981 }
2982
2983 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2984 /// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2985 /// boundary.
2986 ///
2987 /// \headerfile <immintrin.h>
2988 ///
2989 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2990 ///
2991 /// \param __V
2992 /// A pointer to the 32-byte aligned memory containing the vector to load.
2993 /// \returns A 256-bit integer vector loaded from memory.
2994 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_stream_load_si256(const void * __V)2995 _mm256_stream_load_si256(const void *__V)
2996 {
2997 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2998 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2999 }
3000
3001 /// Broadcasts the 32-bit floating-point value from the low element of the
3002 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's
3003 /// 128-bit vector of [4 x float].
3004 ///
3005 /// \headerfile <immintrin.h>
3006 ///
3007 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3008 ///
3009 /// \param __X
3010 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3011 /// \returns A 128-bit vector of [4 x float] containing the result.
3012 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_broadcastss_ps(__m128 __X)3013 _mm_broadcastss_ps(__m128 __X)
3014 {
3015 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3016 }
3017
3018 /// Broadcasts the 64-bit floating-point value from the low element of the
3019 /// 128-bit vector of [2 x double] in \a __a to both elements of the
3020 /// result's 128-bit vector of [2 x double].
3021 ///
3022 /// \headerfile <immintrin.h>
3023 ///
3024 /// This intrinsic corresponds to the \c MOVDDUP instruction.
3025 ///
3026 /// \param __a
3027 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3028 /// \returns A 128-bit vector of [2 x double] containing the result.
3029 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_broadcastsd_pd(__m128d __a)3030 _mm_broadcastsd_pd(__m128d __a)
3031 {
3032 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3033 }
3034
3035 /// Broadcasts the 32-bit floating-point value from the low element of the
3036 /// 128-bit vector of [4 x float] in \a __X to all elements of the
3037 /// result's 256-bit vector of [8 x float].
3038 ///
3039 /// \headerfile <immintrin.h>
3040 ///
3041 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3042 ///
3043 /// \param __X
3044 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3045 /// \returns A 256-bit vector of [8 x float] containing the result.
3046 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_broadcastss_ps(__m128 __X)3047 _mm256_broadcastss_ps(__m128 __X)
3048 {
3049 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3050 }
3051
3052 /// Broadcasts the 64-bit floating-point value from the low element of the
3053 /// 128-bit vector of [2 x double] in \a __X to all elements of the
3054 /// result's 256-bit vector of [4 x double].
3055 ///
3056 /// \headerfile <immintrin.h>
3057 ///
3058 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3059 ///
3060 /// \param __X
3061 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3062 /// \returns A 256-bit vector of [4 x double] containing the result.
3063 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_broadcastsd_pd(__m128d __X)3064 _mm256_broadcastsd_pd(__m128d __X)
3065 {
3066 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3067 }
3068
3069 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3070 /// upper halves of the 256-bit result.
3071 ///
3072 /// \headerfile <immintrin.h>
3073 ///
3074 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3075 ///
3076 /// \param __X
3077 /// A 128-bit integer vector to be broadcast.
3078 /// \returns A 256-bit integer vector containing the result.
3079 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastsi128_si256(__m128i __X)3080 _mm256_broadcastsi128_si256(__m128i __X)
3081 {
3082 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3083 }
3084
3085 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3086
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M)                                             \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1),                   \
                                      (__v4si)(__m128i)(V2), (int)(M)))
3123
/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1),                   \
                                      (__v8si)(__m256i)(V2), (int)(M)))
3160
3161 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3162 /// bytes of the 256-bit result.
3163 ///
3164 /// \headerfile <immintrin.h>
3165 ///
3166 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3167 ///
3168 /// \param __X
3169 /// A 128-bit integer vector whose low byte will be broadcast.
3170 /// \returns A 256-bit integer vector containing the result.
3171 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastb_epi8(__m128i __X)3172 _mm256_broadcastb_epi8(__m128i __X)
3173 {
3174 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3175 }
3176
3177 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3178 /// to all elements of the result's 256-bit vector of [16 x i16].
3179 ///
3180 /// \headerfile <immintrin.h>
3181 ///
3182 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3183 ///
3184 /// \param __X
3185 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3186 /// \returns A 256-bit vector of [16 x i16] containing the result.
3187 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastw_epi16(__m128i __X)3188 _mm256_broadcastw_epi16(__m128i __X)
3189 {
3190 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3191 }
3192
3193 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3194 /// to all elements of the result's 256-bit vector of [8 x i32].
3195 ///
3196 /// \headerfile <immintrin.h>
3197 ///
3198 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3199 ///
3200 /// \param __X
3201 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3202 /// \returns A 256-bit vector of [8 x i32] containing the result.
3203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastd_epi32(__m128i __X)3204 _mm256_broadcastd_epi32(__m128i __X)
3205 {
3206 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3207 }
3208
3209 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3210 /// to all elements of the result's 256-bit vector of [4 x i64].
3211 ///
3212 /// \headerfile <immintrin.h>
3213 ///
3214 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3215 ///
3216 /// \param __X
3217 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3218 /// \returns A 256-bit vector of [4 x i64] containing the result.
3219 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastq_epi64(__m128i __X)3220 _mm256_broadcastq_epi64(__m128i __X)
3221 {
3222 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3223 }
3224
3225 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3226 /// bytes of the 128-bit result.
3227 ///
3228 /// \headerfile <immintrin.h>
3229 ///
3230 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3231 ///
3232 /// \param __X
3233 /// A 128-bit integer vector whose low byte will be broadcast.
3234 /// \returns A 128-bit integer vector containing the result.
3235 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastb_epi8(__m128i __X)3236 _mm_broadcastb_epi8(__m128i __X)
3237 {
3238 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3239 }
3240
3241 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3242 /// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3243 ///
3244 /// \headerfile <immintrin.h>
3245 ///
3246 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3247 ///
3248 /// \param __X
3249 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3250 /// \returns A 128-bit vector of [8 x i16] containing the result.
3251 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastw_epi16(__m128i __X)3252 _mm_broadcastw_epi16(__m128i __X)
3253 {
3254 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3255 }
3256
3257 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3258 /// to all elements of the result's vector of [4 x i32].
3259 ///
3260 /// \headerfile <immintrin.h>
3261 ///
3262 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3263 ///
3264 /// \param __X
3265 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3266 /// \returns A 128-bit vector of [4 x i32] containing the result.
3267 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastd_epi32(__m128i __X)3268 _mm_broadcastd_epi32(__m128i __X)
3269 {
3270 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3271 }
3272
3273 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3274 /// to both elements of the result's 128-bit vector of [2 x i64].
3275 ///
3276 /// \headerfile <immintrin.h>
3277 ///
3278 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3279 ///
3280 /// \param __X
3281 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3282 /// \returns A 128-bit vector of [2 x i64] containing the result.
3283 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastq_epi64(__m128i __X)3284 _mm_broadcastq_epi64(__m128i __X)
3285 {
3286 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3287 }
3288
3289 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3290 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3291 /// elements of the 256-bit vector of [8 x i32] in \a __b.
3292 ///
3293 /// \code{.operation}
3294 /// FOR i := 0 TO 7
3295 /// j := i*32
3296 /// k := __b[j+2:j] * 32
3297 /// result[j+31:j] := __a[k+31:k]
3298 /// ENDFOR
3299 /// \endcode
3300 ///
3301 /// \headerfile <immintrin.h>
3302 ///
3303 /// This intrinsic corresponds to the \c VPERMD instruction.
3304 ///
3305 /// \param __a
3306 /// A 256-bit vector of [8 x i32] containing the source values.
3307 /// \param __b
3308 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3309 /// \a __a.
3310 /// \returns A 256-bit vector of [8 x i32] containing the result.
3311 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutevar8x32_epi32(__m256i __a,__m256i __b)3312 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
3313 {
3314 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3315 }
3316
/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
///    the 256-bit vector of [4 x double] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3346
3347 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3348 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3349 /// the elements of the 256-bit vector of [8 x i32] in \a __b.
3350 ///
3351 /// \code{.operation}
3352 /// FOR i := 0 TO 7
3353 /// j := i*32
3354 /// k := __b[j+2:j] * 32
3355 /// result[j+31:j] := __a[k+31:k]
3356 /// ENDFOR
3357 /// \endcode
3358 ///
3359 /// \headerfile <immintrin.h>
3360 ///
3361 /// This intrinsic corresponds to the \c VPERMPS instruction.
3362 ///
3363 /// \param __a
3364 /// A 256-bit vector of [8 x float] containing the source values.
3365 /// \param __b
3366 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3367 /// \a __a.
3368 /// \returns A 256-bit vector of [8 x float] containing the result.
3369 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_permutevar8x32_ps(__m256 __a,__m256i __b)3370 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
3371 {
3372 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3373 }
3374
/// Sets the result's 256-bit vector of [4 x i64] to copies of elements of
///    the 256-bit vector of [4 x i64] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMQ instruction.
///
/// \param V
///    A 256-bit vector of [4 x i64] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3404
/// Sets each half of the 256-bit result either to zero or to one of the
///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
///    as specified by the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value specifying how to form the result. Bits [3:0]
///    control the lower half of the result, bits [7:4] control the upper half.
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
///    otherwise bits [1:0] determine the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2 \n
///    Bit 2 of each 4-bit control value is not referenced by the operation
///    shown above.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3450
/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
///    otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3470
/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
///    result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
///    is zero, overwrites the lower half of the result; otherwise,
///    overwrites the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result: the
///    lower half if bit 0 is zero, the upper half otherwise.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                         (__v2di)(__m128i)(V2), (int)(M)))
3494
3495 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3496 /// the most significant bit of the corresponding element in the mask
3497 /// \a __M is set; otherwise, sets that element of the result to zero.
3498 /// Returns the 256-bit [8 x i32] result.
3499 ///
3500 /// \code{.operation}
3501 /// FOR i := 0 TO 7
3502 /// j := i*32
3503 /// IF __M[j+31] == 1
3504 /// result[j+31:j] := Load32(__X+(i*4))
3505 /// ELSE
3506 /// result[j+31:j] := 0
3507 /// FI
3508 /// ENDFOR
3509 /// \endcode
3510 ///
3511 /// \headerfile <immintrin.h>
3512 ///
3513 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3514 ///
3515 /// \param __X
3516 /// A pointer to the memory used for loading values.
3517 /// \param __M
3518 /// A 256-bit vector of [8 x i32] containing the mask bits.
3519 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3520 /// elements.
3521 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskload_epi32(int const * __X,__m256i __M)3522 _mm256_maskload_epi32(int const *__X, __m256i __M)
3523 {
3524 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3525 }
3526
3527 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
3528 /// the most significant bit of the corresponding element in the mask
3529 /// \a __M is set; otherwise, sets that element of the result to zero.
3530 /// Returns the 256-bit [4 x i64] result.
3531 ///
3532 /// \code{.operation}
3533 /// FOR i := 0 TO 3
3534 /// j := i*64
3535 /// IF __M[j+63] == 1
3536 /// result[j+63:j] := Load64(__X+(i*8))
3537 /// ELSE
3538 /// result[j+63:j] := 0
3539 /// FI
3540 /// ENDFOR
3541 /// \endcode
3542 ///
3543 /// \headerfile <immintrin.h>
3544 ///
3545 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3546 ///
3547 /// \param __X
3548 /// A pointer to the memory used for loading values.
3549 /// \param __M
3550 /// A 256-bit vector of [4 x i64] containing the mask bits.
3551 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3552 /// elements.
3553 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskload_epi64(long long const * __X,__m256i __M)3554 _mm256_maskload_epi64(long long const *__X, __m256i __M)
3555 {
3556 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3557 }
3558
3559 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
3560 /// the most significant bit of the corresponding element in the mask
3561 /// \a __M is set; otherwise, sets that element of the result to zero.
3562 /// Returns the 128-bit [4 x i32] result.
3563 ///
3564 /// \code{.operation}
3565 /// FOR i := 0 TO 3
3566 /// j := i*32
3567 /// IF __M[j+31] == 1
3568 /// result[j+31:j] := Load32(__X+(i*4))
3569 /// ELSE
3570 /// result[j+31:j] := 0
3571 /// FI
3572 /// ENDFOR
3573 /// \endcode
3574 ///
3575 /// \headerfile <immintrin.h>
3576 ///
3577 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3578 ///
3579 /// \param __X
3580 /// A pointer to the memory used for loading values.
3581 /// \param __M
3582 /// A 128-bit vector of [4 x i32] containing the mask bits.
3583 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3584 /// elements.
3585 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskload_epi32(int const * __X,__m128i __M)3586 _mm_maskload_epi32(int const *__X, __m128i __M)
3587 {
3588 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3589 }
3590
3591 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
3592 /// the most significant bit of the corresponding element in the mask
3593 /// \a __M is set; otherwise, sets that element of the result to zero.
3594 /// Returns the 128-bit [2 x i64] result.
3595 ///
3596 /// \code{.operation}
3597 /// FOR i := 0 TO 1
3598 /// j := i*64
3599 /// IF __M[j+63] == 1
3600 /// result[j+63:j] := Load64(__X+(i*8))
3601 /// ELSE
3602 /// result[j+63:j] := 0
3603 /// FI
3604 /// ENDFOR
3605 /// \endcode
3606 ///
3607 /// \headerfile <immintrin.h>
3608 ///
3609 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3610 ///
3611 /// \param __X
3612 /// A pointer to the memory used for loading values.
3613 /// \param __M
3614 /// A 128-bit vector of [2 x i64] containing the mask bits.
3615 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3616 /// elements.
3617 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskload_epi64(long long const * __X,__m128i __M)3618 _mm_maskload_epi64(long long const *__X, __m128i __M)
3619 {
3620 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3621 }
3622
3623 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3624 /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3625 /// the corresponding element in the mask \a __M is set; otherwise, the
3626 /// memory element is unchanged.
3627 ///
3628 /// \code{.operation}
3629 /// FOR i := 0 TO 7
3630 /// j := i*32
3631 /// IF __M[j+31] == 1
3632 /// Store32(__X+(i*4), __Y[j+31:j])
3633 /// FI
3634 /// ENDFOR
3635 /// \endcode
3636 ///
3637 /// \headerfile <immintrin.h>
3638 ///
3639 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3640 ///
3641 /// \param __X
3642 /// A pointer to the memory used for storing values.
3643 /// \param __M
3644 /// A 256-bit vector of [8 x i32] containing the mask bits.
3645 /// \param __Y
3646 /// A 256-bit vector of [8 x i32] containing the values to store.
3647 static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_maskstore_epi32(int * __X,__m256i __M,__m256i __Y)3648 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3649 {
3650 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3651 }
3652
3653 /// Conditionally stores four 64-bit integer elements from the 256-bit vector
3654 /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3655 /// the corresponding element in the mask \a __M is set; otherwise, the
3656 /// memory element is unchanged.
3657 ///
3658 /// \code{.operation}
3659 /// FOR i := 0 TO 3
3660 /// j := i*64
3661 /// IF __M[j+63] == 1
3662 /// Store64(__X+(i*8), __Y[j+63:j])
3663 /// FI
3664 /// ENDFOR
3665 /// \endcode
3666 ///
3667 /// \headerfile <immintrin.h>
3668 ///
3669 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3670 ///
3671 /// \param __X
3672 /// A pointer to the memory used for storing values.
3673 /// \param __M
3674 /// A 256-bit vector of [4 x i64] containing the mask bits.
3675 /// \param __Y
3676 /// A 256-bit vector of [4 x i64] containing the values to store.
3677 static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_maskstore_epi64(long long * __X,__m256i __M,__m256i __Y)3678 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3679 {
3680 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3681 }
3682
3683 /// Conditionally stores four 32-bit integer elements from the 128-bit vector
3684 /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3685 /// the corresponding element in the mask \a __M is set; otherwise, the
3686 /// memory element is unchanged.
3687 ///
3688 /// \code{.operation}
3689 /// FOR i := 0 TO 3
3690 /// j := i*32
3691 /// IF __M[j+31] == 1
3692 /// Store32(__X+(i*4), __Y[j+31:j])
3693 /// FI
3694 /// ENDFOR
3695 /// \endcode
3696 ///
3697 /// \headerfile <immintrin.h>
3698 ///
3699 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3700 ///
3701 /// \param __X
3702 /// A pointer to the memory used for storing values.
3703 /// \param __M
3704 /// A 128-bit vector of [4 x i32] containing the mask bits.
3705 /// \param __Y
3706 /// A 128-bit vector of [4 x i32] containing the values to store.
3707 static __inline__ void __DEFAULT_FN_ATTRS128
_mm_maskstore_epi32(int * __X,__m128i __M,__m128i __Y)3708 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3709 {
3710 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3711 }
3712
3713 /// Conditionally stores two 64-bit integer elements from the 128-bit vector
3714 /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3715 /// the corresponding element in the mask \a __M is set; otherwise, the
3716 /// memory element is unchanged.
3717 ///
3718 /// \code{.operation}
3719 /// FOR i := 0 TO 1
3720 /// j := i*64
3721 /// IF __M[j+63] == 1
3722 /// Store64(__X+(i*8), __Y[j+63:j])
3723 /// FI
3724 /// ENDFOR
3725 /// \endcode
3726 ///
3727 /// \headerfile <immintrin.h>
3728 ///
3729 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3730 ///
3731 /// \param __X
3732 /// A pointer to the memory used for storing values.
3733 /// \param __M
3734 /// A 128-bit vector of [2 x i64] containing the mask bits.
3735 /// \param __Y
3736 /// A 128-bit vector of [2 x i64] containing the values to store.
3737 static __inline__ void __DEFAULT_FN_ATTRS128
_mm_maskstore_epi64(long long * __X,__m128i __M,__m128i __Y)3738 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3739 {
3740 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3741 }
3742
3743 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3744 /// left by the number of bits given in the corresponding element of the
3745 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3746 /// returns the result. If the shift count for any element is greater than
3747 /// 31, the result for that element is zero.
3748 ///
3749 /// \headerfile <immintrin.h>
3750 ///
3751 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3752 ///
3753 /// \param __X
3754 /// A 256-bit vector of [8 x i32] to be shifted.
3755 /// \param __Y
3756 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3757 /// bits).
3758 /// \returns A 256-bit vector of [8 x i32] containing the result.
3759 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sllv_epi32(__m256i __X,__m256i __Y)3760 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
3761 {
3762 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3763 }
3764
3765 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3766 /// left by the number of bits given in the corresponding element of the
3767 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3768 /// returns the result. If the shift count for any element is greater than
3769 /// 31, the result for that element is zero.
3770 ///
3771 /// \headerfile <immintrin.h>
3772 ///
3773 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3774 ///
3775 /// \param __X
3776 /// A 128-bit vector of [4 x i32] to be shifted.
3777 /// \param __Y
3778 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3779 /// bits).
3780 /// \returns A 128-bit vector of [4 x i32] containing the result.
3781 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_sllv_epi32(__m128i __X,__m128i __Y)3782 _mm_sllv_epi32(__m128i __X, __m128i __Y)
3783 {
3784 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3785 }
3786
3787 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3788 /// left by the number of bits given in the corresponding element of the
3789 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3790 /// returns the result. If the shift count for any element is greater than
3791 /// 63, the result for that element is zero.
3792 ///
3793 /// \headerfile <immintrin.h>
3794 ///
3795 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3796 ///
3797 /// \param __X
3798 /// A 256-bit vector of [4 x i64] to be shifted.
3799 /// \param __Y
3800 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3801 /// bits).
3802 /// \returns A 256-bit vector of [4 x i64] containing the result.
3803 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sllv_epi64(__m256i __X,__m256i __Y)3804 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
3805 {
3806 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3807 }
3808
3809 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3810 /// left by the number of bits given in the corresponding element of the
3811 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3812 /// returns the result. If the shift count for any element is greater than
3813 /// 63, the result for that element is zero.
3814 ///
3815 /// \headerfile <immintrin.h>
3816 ///
3817 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3818 ///
3819 /// \param __X
3820 /// A 128-bit vector of [2 x i64] to be shifted.
3821 /// \param __Y
3822 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3823 /// bits).
3824 /// \returns A 128-bit vector of [2 x i64] containing the result.
3825 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_sllv_epi64(__m128i __X,__m128i __Y)3826 _mm_sllv_epi64(__m128i __X, __m128i __Y)
3827 {
3828 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3829 }
3830
3831 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3832 /// right by the number of bits given in the corresponding element of the
3833 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3834 /// returns the result. If the shift count for any element is greater than
3835 /// 31, the result for that element is 0 or -1 according to the sign bit
3836 /// for that element.
3837 ///
3838 /// \headerfile <immintrin.h>
3839 ///
3840 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3841 ///
3842 /// \param __X
3843 /// A 256-bit vector of [8 x i32] to be shifted.
3844 /// \param __Y
3845 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3846 /// bits).
3847 /// \returns A 256-bit vector of [8 x i32] containing the result.
3848 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srav_epi32(__m256i __X,__m256i __Y)3849 _mm256_srav_epi32(__m256i __X, __m256i __Y)
3850 {
3851 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3852 }
3853
3854 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3855 /// right by the number of bits given in the corresponding element of the
3856 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3857 /// returns the result. If the shift count for any element is greater than
3858 /// 31, the result for that element is 0 or -1 according to the sign bit
3859 /// for that element.
3860 ///
3861 /// \headerfile <immintrin.h>
3862 ///
3863 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3864 ///
3865 /// \param __X
3866 /// A 128-bit vector of [4 x i32] to be shifted.
3867 /// \param __Y
3868 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3869 /// bits).
3870 /// \returns A 128-bit vector of [4 x i32] containing the result.
3871 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_srav_epi32(__m128i __X,__m128i __Y)3872 _mm_srav_epi32(__m128i __X, __m128i __Y)
3873 {
3874 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3875 }
3876
3877 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3878 /// right by the number of bits given in the corresponding element of the
3879 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3880 /// returns the result. If the shift count for any element is greater than
3881 /// 31, the result for that element is zero.
3882 ///
3883 /// \headerfile <immintrin.h>
3884 ///
3885 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3886 ///
3887 /// \param __X
3888 /// A 256-bit vector of [8 x i32] to be shifted.
3889 /// \param __Y
3890 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3891 /// bits).
3892 /// \returns A 256-bit vector of [8 x i32] containing the result.
3893 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srlv_epi32(__m256i __X,__m256i __Y)3894 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
3895 {
3896 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3897 }
3898
3899 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3900 /// right by the number of bits given in the corresponding element of the
3901 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3902 /// returns the result. If the shift count for any element is greater than
3903 /// 31, the result for that element is zero.
3904 ///
3905 /// \headerfile <immintrin.h>
3906 ///
3907 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3908 ///
3909 /// \param __X
3910 /// A 128-bit vector of [4 x i32] to be shifted.
3911 /// \param __Y
3912 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3913 /// bits).
3914 /// \returns A 128-bit vector of [4 x i32] containing the result.
3915 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_srlv_epi32(__m128i __X,__m128i __Y)3916 _mm_srlv_epi32(__m128i __X, __m128i __Y)
3917 {
3918 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3919 }
3920
3921 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3922 /// right by the number of bits given in the corresponding element of the
3923 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3924 /// returns the result. If the shift count for any element is greater than
3925 /// 63, the result for that element is zero.
3926 ///
3927 /// \headerfile <immintrin.h>
3928 ///
3929 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3930 ///
3931 /// \param __X
3932 /// A 256-bit vector of [4 x i64] to be shifted.
3933 /// \param __Y
3934 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3935 /// bits).
3936 /// \returns A 256-bit vector of [4 x i64] containing the result.
3937 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srlv_epi64(__m256i __X,__m256i __Y)3938 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
3939 {
3940 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3941 }
3942
3943 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3944 /// right by the number of bits given in the corresponding element of the
3945 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3946 /// returns the result. If the shift count for any element is greater than
3947 /// 63, the result for that element is zero.
3948 ///
3949 /// \headerfile <immintrin.h>
3950 ///
3951 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3952 ///
3953 /// \param __X
3954 /// A 128-bit vector of [2 x i64] to be shifted.
3955 /// \param __Y
3956 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3957 /// bits).
3958 /// \returns A 128-bit vector of [2 x i64] containing the result.
3959 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_srlv_epi64(__m128i __X,__m128i __Y)3960 _mm_srlv_epi64(__m128i __X, __m128i __Y)
3961 {
3962 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3963 }
3964
/// Gathers two 64-bit floating-point values under control of a mask. Each
///    result element is either loaded from memory at \a m, using the scaled
///    index held in the 128-bit vector of [4 x i32] in \a i, or copied from
///    the 128-bit vector of [2 x double] in \a a. The 128-bit vector of
///    [2 x double] in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] holding the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      (__v2df)(__m128d)(a), (const double *)(m), (__v4si)(__m128i)(i), \
      (__v2df)(__m128d)(mask), (s)))
4013
/// Gathers four 64-bit floating-point values under control of a mask. Each
///    result element is either loaded from memory at \a m, using the scaled
///    index held in the 128-bit vector of [4 x i32] in \a i, or copied from
///    the 256-bit vector of [4 x double] in \a a. The 256-bit vector of
///    [4 x double] in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] holding the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), (const double *)(m), (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), (s)))
4061
/// Gathers two 64-bit floating-point values under control of a mask. Each
///    result element is either loaded from memory at \a m, using the scaled
///    index held in the 128-bit vector of [2 x i64] in \a i, or copied from
///    the 128-bit vector of [2 x double] in \a a. The 128-bit vector of
///    [2 x double] in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] holding the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), (const double *)(m), (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), (s)))
4109
/// Gathers four 64-bit floating-point values under control of a mask. Each
///    result element is either loaded from memory at \a m, using the scaled
///    index held in the 256-bit vector of [4 x i64] in \a i, or copied from
///    the 256-bit vector of [4 x double] in \a a. The 256-bit vector of
///    [4 x double] in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] holding the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), (const double *)(m), (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), (s)))
4157
/// Gathers four 32-bit floating-point values under control of a mask. Each
///    result element is either loaded from memory at \a m, using the scaled
///    index held in the 128-bit vector of [4 x i32] in \a i, or copied from
///    the 128-bit vector of [4 x float] in \a a. The 128-bit vector of
///    [4 x float] in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a), (const float *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4205
/// Gathers eight 32-bit floating-point values under control of a mask. Each
///    result element is either loaded from memory at \a m, using the scaled
///    index held in the 256-bit vector of [8 x i32] in \a i, or copied from
///    the 256-bit vector of [8 x float] in \a a. The 256-bit vector of
///    [8 x float] in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] holding the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a), (const float *)(m), (__v8si)(__m256i)(i), \
      (__v8sf)(__m256)(mask), (s)))
4253
/// Gathers two 32-bit floating-point values under control of a mask. Each of
///    the lower two result elements is either loaded from memory at \a m,
///    using the scaled index held in the 128-bit vector of [2 x i64] in
///    \a i, or copied from the 128-bit vector of [4 x float] in \a a. The
///    128-bit vector of [4 x float] in \a mask makes that choice for the
///    lower two elements. The upper two elements of the result are set to
///    zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result element wherever
///    the mask bit is zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
///    Only the first two elements are used.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a), (const float *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4304
/// Gathers four 32-bit floating-point values under control of a mask. Each
///    result element is either loaded from memory at \a m, using the scaled
///    index held in the 256-bit vector of [4 x i64] in \a i, or copied from
///    the 128-bit vector of [4 x float] in \a a. The 128-bit vector of
///    [4 x float] in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a), (const float *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4352
/// Gathers four 32-bit integer values under control of a mask. Each result
///    element is either loaded from memory at \a m, using the scaled index
///    held in the 128-bit vector of [4 x i32] in \a i, or copied from the
///    128-bit vector of [4 x i32] in \a a. The 128-bit vector of [4 x i32]
///    in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a), (const int *)(m), (__v4si)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4400
/// Gathers eight 32-bit integer values under control of a mask. Each result
///    element is either loaded from memory at \a m, using the scaled index
///    held in the 256-bit vector of [8 x i32] in \a i, or copied from the
///    256-bit vector of [8 x i32] in \a a. The 256-bit vector of [8 x i32]
///    in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] holding the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)(__m256i)(a), (const int *)(m), (__v8si)(__m256i)(i), \
      (__v8si)(__m256i)(mask), (s)))
4448
/// Gathers two 32-bit integer values under control of a mask. Each of the
///    lower two result elements is either loaded from memory at \a m, using
///    the scaled index held in the 128-bit vector of [2 x i64] in \a i, or
///    copied from the 128-bit vector of [4 x i32] in \a a. The 128-bit
///    vector of [4 x i32] in \a mask makes that choice for the lower two
///    elements. The upper two elements of the result are set to zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result element wherever
///    the mask bit is zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
///    Only the first two elements are used.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)(__m128i)(a), (const int *)(m), (__v2di)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4499
/// Gathers four 32-bit integer values under control of a mask. Each result
///    element is either loaded from memory at \a m, using the scaled index
///    held in the 256-bit vector of [4 x i64] in \a i, or copied from the
///    128-bit vector of [4 x i32] in \a a. The 128-bit vector of [4 x i32]
///    in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a), (const int *)(m), (__v4di)(__m256i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4547
/// Gathers two 64-bit integer values under control of a mask. Each result
///    element is either loaded from memory at \a m, using the scaled index
///    held in the 128-bit vector of [4 x i32] in \a i, or copied from the
///    128-bit vector of [2 x i64] in \a a. The 128-bit vector of [2 x i64]
///    in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a), (const long long *)(m), (__v4si)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4596
/// Gathers four 64-bit integer values under control of a mask. Each result
///    element is either loaded from memory at \a m, using the scaled index
///    held in the 128-bit vector of [4 x i32] in \a i, or copied from the
///    256-bit vector of [4 x i64] in \a a. The 256-bit vector of [4 x i64]
///    in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a), (const long long *)(m), (__v4si)(__m128i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4644
/// Gathers two 64-bit integer values under control of a mask. Each result
///    element is either loaded from memory at \a m, using the scaled index
///    held in the 128-bit vector of [2 x i64] in \a i, or copied from the
///    128-bit vector of [2 x i64] in \a a. The 128-bit vector of [2 x i64]
///    in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a), (const long long *)(m), (__v2di)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4692
/// Gathers four 64-bit integer values under control of a mask. Each result
///    element is either loaded from memory at \a m, using the scaled index
///    held in the 256-bit vector of [4 x i64] in \a i, or copied from the
///    256-bit vector of [4 x i64] in \a a. The 256-bit vector of [4 x i64]
///    in \a mask makes that choice per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    A pointer to the memory the values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] holding the mask. Only the most
///    significant bit of each element is examined: a zero bit keeps the
///    corresponding element of \a a, a set bit loads the element from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a), (const long long *)(m), (__v4di)(__m256i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4740
/// Loads two 64-bit floating-point values from memory. Each address is
///    formed by adding a scaled index from the 128-bit vector of [4 x i32] in
///    \a i to the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m. Only
///    the two lowest elements are consulted.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] holding the loaded values.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v4si)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
4776
/// Loads four 64-bit floating-point values from memory. Each address is
///    formed by adding a scaled index from the 128-bit vector of [4 x i32] in
///    \a i to the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] holding the loaded values.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4812
/// Loads two 64-bit floating-point values from memory. Each address is
///    formed by adding a scaled index from the 128-bit vector of [2 x i64] in
///    \a i to the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] holding the loaded values.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v2di)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
4847
/// Loads four 64-bit floating-point values from memory. Each address is
///    formed by adding a scaled index from the 256-bit vector of [4 x i64] in
///    \a i to the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] holding the loaded values.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4883
/// Loads four 32-bit floating-point values from memory. Each address is
///    formed by adding a scaled index from the 128-bit vector of [4 x i32] in
///    \a i to the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the loaded values.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4918
/// Loads eight 32-bit floating-point values from memory. Each address is
///    formed by adding a scaled index from the 256-bit vector of [8 x i32] in
///    \a i to the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] holding the loaded values.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v8sf)_mm256_undefined_ps(), (float const *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))
4954
/// Loads two 32-bit floating-point values from memory. Each address is
///    formed by adding a scaled index from the 128-bit vector of [2 x i64] in
///    \a i to the base pointer \a m. The two upper elements of the result are
///    set to zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the loaded values.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4991
/// Loads four 32-bit floating-point values from memory. Each address is
///    formed by adding a scaled index from the 256-bit vector of [4 x i64] in
///    \a i to the base pointer \a m. Because the indexes are 64 bits wide but
///    the loaded values are 32 bits wide, the result is a 128-bit vector.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the loaded values.
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
5026
/// Loads four 32-bit integer values from memory. Each address is formed by
///    adding a scaled index from the 128-bit vector of [4 x i32] in \a i to
///    the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the loaded values.
#define _mm_i32gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4si)_mm_undefined_si128(), (int const *)(m), (__v4si)(__m128i)(i), \
      (__v4si)_mm_set1_epi32(-1), (s)))
5058
/// Loads eight 32-bit integer values from memory. Each address is formed by
///    adding a scaled index from the 256-bit vector of [8 x i32] in \a i to
///    the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] holding the loaded values.
#define _mm256_i32gather_epi32(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v8si)_mm256_undefined_si256(), (int const *)(m), \
      (__v8si)(__m256i)(i), (__v8si)_mm256_set1_epi32(-1), (s)))
5090
/// Loads two 32-bit integer values from memory. Each address is formed by
///    adding a scaled index from the 128-bit vector of [2 x i64] in \a i to
///    the base pointer \a m. The two upper elements of the result are set to
///    zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the loaded values.
#define _mm_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4si)_mm_undefined_si128(), (int const *)(m), (__v2di)(__m128i)(i), \
      (__v4si)_mm_set1_epi32(-1), (s)))
5124
/// Loads four 32-bit integer values from memory. Each address is formed by
///    adding a scaled index from the 256-bit vector of [4 x i64] in \a i to
///    the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the loaded values.
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4si)_mm_undefined_si128(), (int const *)(m), (__v4di)(__m256i)(i), \
      (__v4si)_mm_set1_epi32(-1), (s)))
5156
/// Loads two 64-bit integer values from memory. Each address is formed by
///    adding a scaled index from the 128-bit vector of [4 x i32] in \a i to
///    the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m. Only
///    the two lowest elements are consulted.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the loaded values.
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v2di)_mm_undefined_si128(), (long long const *)(m), \
      (__v4si)(__m128i)(i), (__v2di)_mm_set1_epi64x(-1), (s)))
5190
/// Loads four 64-bit integer values from memory. Each address is formed by
///    adding a scaled index from the 128-bit vector of [4 x i32] in \a i to
///    the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the loaded values.
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4di)_mm256_undefined_si256(), (long long const *)(m), \
      (__v4si)(__m128i)(i), (__v4di)_mm256_set1_epi64x(-1), (s)))
5223
/// Loads two 64-bit integer values from memory. Each address is formed by
///    adding a scaled index from the 128-bit vector of [2 x i64] in \a i to
///    the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the loaded values.
#define _mm_i64gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v2di)_mm_undefined_si128(), (long long const *)(m), \
      (__v2di)(__m128i)(i), (__v2di)_mm_set1_epi64x(-1), (s)))
5256
/// Loads four 64-bit integer values from memory. Each address is formed by
///    adding a scaled index from the 256-bit vector of [4 x i64] in \a i to
///    the base pointer \a m.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    The base address to which the scaled indexes are added.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the loaded values.
#define _mm256_i64gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      /* All-ones mask, so the undefined pass-through is never selected. */ \
      (__v4di)_mm256_undefined_si256(), (long long const *)(m), \
      (__v4di)(__m256i)(i), (__v4di)_mm256_set1_epi64x(-1), (s)))
5289
/* The default-attribute helper macros were defined at the top of this header
 * for its own function definitions; remove them so they do not remain defined
 * after this header has been processed. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
5292
5293 #endif /* __AVX2INTRIN_H */
5294