/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* Public 128-bit vector types: two doubles / two 64-bit integers, declared
 * with 16-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* The same vector types declared with 1-byte alignment. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines.  */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
40 
41 #ifdef __SSE2__
42 /* Both _Float16 and __bf16 require SSE2 being enabled. */
43 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46 
47 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49 #endif
50 
/* Define the default attributes for the functions in this file.
 *
 * Both variants request always-inline, no debug info, the "sse2" target
 * feature and a minimum vector width of 128. When __EVEX512__ is defined and
 * __AVX10_1_512__ is not, the target string additionally carries
 * "no-evex512". */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#endif
61 
62 #define __trunc64(x)                                                           \
63   (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
64 #define __anyext128(x)                                                         \
65   (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
66                                     1, -1, -1)
67 
68 /// Adds lower double-precision values in both operands and returns the
69 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
70 ///    are copied from the upper double-precision value of the first operand.
71 ///
72 /// \headerfile <x86intrin.h>
73 ///
74 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
75 ///
76 /// \param __a
77 ///    A 128-bit vector of [2 x double] containing one of the source operands.
78 /// \param __b
79 ///    A 128-bit vector of [2 x double] containing one of the source operands.
80 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
81 ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
82 ///    from the upper 64 bits of the first source operand.
_mm_add_sd(__m128d __a,__m128d __b)83 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
84                                                         __m128d __b) {
85   __a[0] += __b[0];
86   return __a;
87 }
88 
89 /// Adds two 128-bit vectors of [2 x double].
90 ///
91 /// \headerfile <x86intrin.h>
92 ///
93 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
94 ///
95 /// \param __a
96 ///    A 128-bit vector of [2 x double] containing one of the source operands.
97 /// \param __b
98 ///    A 128-bit vector of [2 x double] containing one of the source operands.
99 /// \returns A 128-bit vector of [2 x double] containing the sums of both
100 ///    operands.
_mm_add_pd(__m128d __a,__m128d __b)101 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
102                                                         __m128d __b) {
103   return (__m128d)((__v2df)__a + (__v2df)__b);
104 }
105 
106 /// Subtracts the lower double-precision value of the second operand
107 ///    from the lower double-precision value of the first operand and returns
108 ///    the difference in the lower 64 bits of the result. The upper 64 bits of
109 ///    the result are copied from the upper double-precision value of the first
110 ///    operand.
111 ///
112 /// \headerfile <x86intrin.h>
113 ///
114 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
115 ///
116 /// \param __a
117 ///    A 128-bit vector of [2 x double] containing the minuend.
118 /// \param __b
119 ///    A 128-bit vector of [2 x double] containing the subtrahend.
120 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
121 ///    difference of the lower 64 bits of both operands. The upper 64 bits are
122 ///    copied from the upper 64 bits of the first source operand.
_mm_sub_sd(__m128d __a,__m128d __b)123 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
124                                                         __m128d __b) {
125   __a[0] -= __b[0];
126   return __a;
127 }
128 
129 /// Subtracts two 128-bit vectors of [2 x double].
130 ///
131 /// \headerfile <x86intrin.h>
132 ///
133 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
134 ///
135 /// \param __a
136 ///    A 128-bit vector of [2 x double] containing the minuend.
137 /// \param __b
138 ///    A 128-bit vector of [2 x double] containing the subtrahend.
139 /// \returns A 128-bit vector of [2 x double] containing the differences between
140 ///    both operands.
_mm_sub_pd(__m128d __a,__m128d __b)141 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
142                                                         __m128d __b) {
143   return (__m128d)((__v2df)__a - (__v2df)__b);
144 }
145 
146 /// Multiplies lower double-precision values in both operands and returns
147 ///    the product in the lower 64 bits of the result. The upper 64 bits of the
148 ///    result are copied from the upper double-precision value of the first
149 ///    operand.
150 ///
151 /// \headerfile <x86intrin.h>
152 ///
153 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
154 ///
155 /// \param __a
156 ///    A 128-bit vector of [2 x double] containing one of the source operands.
157 /// \param __b
158 ///    A 128-bit vector of [2 x double] containing one of the source operands.
159 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
160 ///    product of the lower 64 bits of both operands. The upper 64 bits are
161 ///    copied from the upper 64 bits of the first source operand.
_mm_mul_sd(__m128d __a,__m128d __b)162 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
163                                                         __m128d __b) {
164   __a[0] *= __b[0];
165   return __a;
166 }
167 
168 /// Multiplies two 128-bit vectors of [2 x double].
169 ///
170 /// \headerfile <x86intrin.h>
171 ///
172 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
173 ///
174 /// \param __a
175 ///    A 128-bit vector of [2 x double] containing one of the operands.
176 /// \param __b
177 ///    A 128-bit vector of [2 x double] containing one of the operands.
178 /// \returns A 128-bit vector of [2 x double] containing the products of both
179 ///    operands.
_mm_mul_pd(__m128d __a,__m128d __b)180 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
181                                                         __m128d __b) {
182   return (__m128d)((__v2df)__a * (__v2df)__b);
183 }
184 
185 /// Divides the lower double-precision value of the first operand by the
186 ///    lower double-precision value of the second operand and returns the
187 ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
188 ///    result are copied from the upper double-precision value of the first
189 ///    operand.
190 ///
191 /// \headerfile <x86intrin.h>
192 ///
193 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
194 ///
195 /// \param __a
196 ///    A 128-bit vector of [2 x double] containing the dividend.
197 /// \param __b
198 ///    A 128-bit vector of [2 x double] containing divisor.
199 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
200 ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
201 ///    copied from the upper 64 bits of the first source operand.
_mm_div_sd(__m128d __a,__m128d __b)202 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
203                                                         __m128d __b) {
204   __a[0] /= __b[0];
205   return __a;
206 }
207 
208 /// Performs an element-by-element division of two 128-bit vectors of
209 ///    [2 x double].
210 ///
211 /// \headerfile <x86intrin.h>
212 ///
213 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
214 ///
215 /// \param __a
216 ///    A 128-bit vector of [2 x double] containing the dividend.
217 /// \param __b
218 ///    A 128-bit vector of [2 x double] containing the divisor.
219 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
220 ///    operands.
_mm_div_pd(__m128d __a,__m128d __b)221 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
222                                                         __m128d __b) {
223   return (__m128d)((__v2df)__a / (__v2df)__b);
224 }
225 
226 /// Calculates the square root of the lower double-precision value of
227 ///    the second operand and returns it in the lower 64 bits of the result.
228 ///    The upper 64 bits of the result are copied from the upper
229 ///    double-precision value of the first operand.
230 ///
231 /// \headerfile <x86intrin.h>
232 ///
233 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
234 ///
235 /// \param __a
236 ///    A 128-bit vector of [2 x double] containing one of the operands. The
237 ///    upper 64 bits of this operand are copied to the upper 64 bits of the
238 ///    result.
239 /// \param __b
240 ///    A 128-bit vector of [2 x double] containing one of the operands. The
241 ///    square root is calculated using the lower 64 bits of this operand.
242 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
243 ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
244 ///    bits are copied from the upper 64 bits of operand \a __a.
_mm_sqrt_sd(__m128d __a,__m128d __b)245 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
246                                                          __m128d __b) {
247   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
248   return __extension__(__m128d){__c[0], __a[1]};
249 }
250 
251 /// Calculates the square root of the each of two values stored in a
252 ///    128-bit vector of [2 x double].
253 ///
254 /// \headerfile <x86intrin.h>
255 ///
256 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
257 ///
258 /// \param __a
259 ///    A 128-bit vector of [2 x double].
260 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
261 ///    values in the operand.
_mm_sqrt_pd(__m128d __a)262 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
263   return __builtin_ia32_sqrtpd((__v2df)__a);
264 }
265 
266 /// Compares lower 64-bit double-precision values of both operands, and
267 ///    returns the lesser of the pair of values in the lower 64-bits of the
268 ///    result. The upper 64 bits of the result are copied from the upper
269 ///    double-precision value of the first operand.
270 ///
271 ///    If either value in a comparison is NaN, returns the value from \a __b.
272 ///
273 /// \headerfile <x86intrin.h>
274 ///
275 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
276 ///
277 /// \param __a
278 ///    A 128-bit vector of [2 x double] containing one of the operands. The
279 ///    lower 64 bits of this operand are used in the comparison.
280 /// \param __b
281 ///    A 128-bit vector of [2 x double] containing one of the operands. The
282 ///    lower 64 bits of this operand are used in the comparison.
283 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
284 ///    minimum value between both operands. The upper 64 bits are copied from
285 ///    the upper 64 bits of the first source operand.
_mm_min_sd(__m128d __a,__m128d __b)286 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
287                                                         __m128d __b) {
288   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
289 }
290 
291 /// Performs element-by-element comparison of the two 128-bit vectors of
292 ///    [2 x double] and returns a vector containing the lesser of each pair of
293 ///    values.
294 ///
295 ///    If either value in a comparison is NaN, returns the value from \a __b.
296 ///
297 /// \headerfile <x86intrin.h>
298 ///
299 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
300 ///
301 /// \param __a
302 ///    A 128-bit vector of [2 x double] containing one of the operands.
303 /// \param __b
304 ///    A 128-bit vector of [2 x double] containing one of the operands.
305 /// \returns A 128-bit vector of [2 x double] containing the minimum values
306 ///    between both operands.
_mm_min_pd(__m128d __a,__m128d __b)307 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
308                                                         __m128d __b) {
309   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
310 }
311 
312 /// Compares lower 64-bit double-precision values of both operands, and
313 ///    returns the greater of the pair of values in the lower 64-bits of the
314 ///    result. The upper 64 bits of the result are copied from the upper
315 ///    double-precision value of the first operand.
316 ///
317 ///    If either value in a comparison is NaN, returns the value from \a __b.
318 ///
319 /// \headerfile <x86intrin.h>
320 ///
321 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
322 ///
323 /// \param __a
324 ///    A 128-bit vector of [2 x double] containing one of the operands. The
325 ///    lower 64 bits of this operand are used in the comparison.
326 /// \param __b
327 ///    A 128-bit vector of [2 x double] containing one of the operands. The
328 ///    lower 64 bits of this operand are used in the comparison.
329 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
330 ///    maximum value between both operands. The upper 64 bits are copied from
331 ///    the upper 64 bits of the first source operand.
_mm_max_sd(__m128d __a,__m128d __b)332 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
333                                                         __m128d __b) {
334   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
335 }
336 
337 /// Performs element-by-element comparison of the two 128-bit vectors of
338 ///    [2 x double] and returns a vector containing the greater of each pair
339 ///    of values.
340 ///
341 ///    If either value in a comparison is NaN, returns the value from \a __b.
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
346 ///
347 /// \param __a
348 ///    A 128-bit vector of [2 x double] containing one of the operands.
349 /// \param __b
350 ///    A 128-bit vector of [2 x double] containing one of the operands.
351 /// \returns A 128-bit vector of [2 x double] containing the maximum values
352 ///    between both operands.
_mm_max_pd(__m128d __a,__m128d __b)353 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
354                                                         __m128d __b) {
355   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
356 }
357 
358 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
359 ///
360 /// \headerfile <x86intrin.h>
361 ///
362 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
363 ///
364 /// \param __a
365 ///    A 128-bit vector of [2 x double] containing one of the source operands.
366 /// \param __b
367 ///    A 128-bit vector of [2 x double] containing one of the source operands.
368 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
369 ///    values between both operands.
_mm_and_pd(__m128d __a,__m128d __b)370 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
371                                                         __m128d __b) {
372   return (__m128d)((__v2du)__a & (__v2du)__b);
373 }
374 
375 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
376 ///    the one's complement of the values contained in the first source operand.
377 ///
378 /// \headerfile <x86intrin.h>
379 ///
380 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
381 ///
382 /// \param __a
383 ///    A 128-bit vector of [2 x double] containing the left source operand. The
384 ///    one's complement of this value is used in the bitwise AND.
385 /// \param __b
386 ///    A 128-bit vector of [2 x double] containing the right source operand.
387 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
388 ///    values in the second operand and the one's complement of the first
389 ///    operand.
_mm_andnot_pd(__m128d __a,__m128d __b)390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
391                                                            __m128d __b) {
392   return (__m128d)(~(__v2du)__a & (__v2du)__b);
393 }
394 
395 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
396 ///
397 /// \headerfile <x86intrin.h>
398 ///
399 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
400 ///
401 /// \param __a
402 ///    A 128-bit vector of [2 x double] containing one of the source operands.
403 /// \param __b
404 ///    A 128-bit vector of [2 x double] containing one of the source operands.
405 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
406 ///    values between both operands.
_mm_or_pd(__m128d __a,__m128d __b)407 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
408                                                        __m128d __b) {
409   return (__m128d)((__v2du)__a | (__v2du)__b);
410 }
411 
412 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
413 ///
414 /// \headerfile <x86intrin.h>
415 ///
416 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
417 ///
418 /// \param __a
419 ///    A 128-bit vector of [2 x double] containing one of the source operands.
420 /// \param __b
421 ///    A 128-bit vector of [2 x double] containing one of the source operands.
422 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
423 ///    values between both operands.
_mm_xor_pd(__m128d __a,__m128d __b)424 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
425                                                         __m128d __b) {
426   return (__m128d)((__v2du)__a ^ (__v2du)__b);
427 }
428 
429 /// Compares each of the corresponding double-precision values of the
430 ///    128-bit vectors of [2 x double] for equality.
431 ///
432 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
433 ///    If either value in a comparison is NaN, returns false.
434 ///
435 /// \headerfile <x86intrin.h>
436 ///
437 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
438 ///
439 /// \param __a
440 ///    A 128-bit vector of [2 x double].
441 /// \param __b
442 ///    A 128-bit vector of [2 x double].
443 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpeq_pd(__m128d __a,__m128d __b)444 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
445                                                           __m128d __b) {
446   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
447 }
448 
449 /// Compares each of the corresponding double-precision values of the
450 ///    128-bit vectors of [2 x double] to determine if the values in the first
451 ///    operand are less than those in the second operand.
452 ///
453 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
454 ///    If either value in a comparison is NaN, returns false.
455 ///
456 /// \headerfile <x86intrin.h>
457 ///
458 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
459 ///
460 /// \param __a
461 ///    A 128-bit vector of [2 x double].
462 /// \param __b
463 ///    A 128-bit vector of [2 x double].
464 /// \returns A 128-bit vector containing the comparison results.
_mm_cmplt_pd(__m128d __a,__m128d __b)465 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
466                                                           __m128d __b) {
467   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
468 }
469 
470 /// Compares each of the corresponding double-precision values of the
471 ///    128-bit vectors of [2 x double] to determine if the values in the first
472 ///    operand are less than or equal to those in the second operand.
473 ///
474 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
475 ///    If either value in a comparison is NaN, returns false.
476 ///
477 /// \headerfile <x86intrin.h>
478 ///
479 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
480 ///
481 /// \param __a
482 ///    A 128-bit vector of [2 x double].
483 /// \param __b
484 ///    A 128-bit vector of [2 x double].
485 /// \returns A 128-bit vector containing the comparison results.
_mm_cmple_pd(__m128d __a,__m128d __b)486 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
487                                                           __m128d __b) {
488   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
489 }
490 
491 /// Compares each of the corresponding double-precision values of the
492 ///    128-bit vectors of [2 x double] to determine if the values in the first
493 ///    operand are greater than those in the second operand.
494 ///
495 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
496 ///    If either value in a comparison is NaN, returns false.
497 ///
498 /// \headerfile <x86intrin.h>
499 ///
500 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
501 ///
502 /// \param __a
503 ///    A 128-bit vector of [2 x double].
504 /// \param __b
505 ///    A 128-bit vector of [2 x double].
506 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpgt_pd(__m128d __a,__m128d __b)507 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
508                                                           __m128d __b) {
509   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
510 }
511 
512 /// Compares each of the corresponding double-precision values of the
513 ///    128-bit vectors of [2 x double] to determine if the values in the first
514 ///    operand are greater than or equal to those in the second operand.
515 ///
516 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
517 ///    If either value in a comparison is NaN, returns false.
518 ///
519 /// \headerfile <x86intrin.h>
520 ///
521 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
522 ///
523 /// \param __a
524 ///    A 128-bit vector of [2 x double].
525 /// \param __b
526 ///    A 128-bit vector of [2 x double].
527 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpge_pd(__m128d __a,__m128d __b)528 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
529                                                           __m128d __b) {
530   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
531 }
532 
533 /// Compares each of the corresponding double-precision values of the
534 ///    128-bit vectors of [2 x double] to determine if the values in the first
535 ///    operand are ordered with respect to those in the second operand.
536 ///
537 ///    A pair of double-precision values are ordered with respect to each
538 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
539 ///    0xFFFFFFFFFFFFFFFF for true.
540 ///
541 /// \headerfile <x86intrin.h>
542 ///
543 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
544 ///
545 /// \param __a
546 ///    A 128-bit vector of [2 x double].
547 /// \param __b
548 ///    A 128-bit vector of [2 x double].
549 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpord_pd(__m128d __a,__m128d __b)550 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
551                                                            __m128d __b) {
552   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
553 }
554 
555 /// Compares each of the corresponding double-precision values of the
556 ///    128-bit vectors of [2 x double] to determine if the values in the first
557 ///    operand are unordered with respect to those in the second operand.
558 ///
559 ///    A pair of double-precision values are unordered with respect to each
560 ///    other if one or both values are NaN. Each comparison returns 0x0 for
561 ///    false, 0xFFFFFFFFFFFFFFFF for true.
562 ///
563 /// \headerfile <x86intrin.h>
564 ///
565 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
566 ///   instruction.
567 ///
568 /// \param __a
569 ///    A 128-bit vector of [2 x double].
570 /// \param __b
571 ///    A 128-bit vector of [2 x double].
572 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpunord_pd(__m128d __a,__m128d __b)573 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
574                                                              __m128d __b) {
575   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
576 }
577 
578 /// Compares each of the corresponding double-precision values of the
579 ///    128-bit vectors of [2 x double] to determine if the values in the first
580 ///    operand are unequal to those in the second operand.
581 ///
582 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
583 ///    If either value in a comparison is NaN, returns true.
584 ///
585 /// \headerfile <x86intrin.h>
586 ///
587 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
588 ///
589 /// \param __a
590 ///    A 128-bit vector of [2 x double].
591 /// \param __b
592 ///    A 128-bit vector of [2 x double].
593 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpneq_pd(__m128d __a,__m128d __b)594 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
595                                                            __m128d __b) {
596   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
597 }
598 
599 /// Compares each of the corresponding double-precision values of the
600 ///    128-bit vectors of [2 x double] to determine if the values in the first
601 ///    operand are not less than those in the second operand.
602 ///
603 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
604 ///    If either value in a comparison is NaN, returns true.
605 ///
606 /// \headerfile <x86intrin.h>
607 ///
608 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
609 ///
610 /// \param __a
611 ///    A 128-bit vector of [2 x double].
612 /// \param __b
613 ///    A 128-bit vector of [2 x double].
614 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpnlt_pd(__m128d __a,__m128d __b)615 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
616                                                            __m128d __b) {
617   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
618 }
619 
620 /// Compares each of the corresponding double-precision values of the
621 ///    128-bit vectors of [2 x double] to determine if the values in the first
622 ///    operand are not less than or equal to those in the second operand.
623 ///
624 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
625 ///    If either value in a comparison is NaN, returns true.
626 ///
627 /// \headerfile <x86intrin.h>
628 ///
629 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
630 ///
631 /// \param __a
632 ///    A 128-bit vector of [2 x double].
633 /// \param __b
634 ///    A 128-bit vector of [2 x double].
635 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpnle_pd(__m128d __a,__m128d __b)636 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
637                                                            __m128d __b) {
638   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
639 }
640 
641 /// Compares each of the corresponding double-precision values of the
642 ///    128-bit vectors of [2 x double] to determine if the values in the first
643 ///    operand are not greater than those in the second operand.
644 ///
645 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
646 ///    If either value in a comparison is NaN, returns true.
647 ///
648 /// \headerfile <x86intrin.h>
649 ///
650 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
651 ///
652 /// \param __a
653 ///    A 128-bit vector of [2 x double].
654 /// \param __b
655 ///    A 128-bit vector of [2 x double].
656 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpngt_pd(__m128d __a,__m128d __b)657 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
658                                                            __m128d __b) {
659   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
660 }
661 
662 /// Compares each of the corresponding double-precision values of the
663 ///    128-bit vectors of [2 x double] to determine if the values in the first
664 ///    operand are not greater than or equal to those in the second operand.
665 ///
666 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
667 ///    If either value in a comparison is NaN, returns true.
668 ///
669 /// \headerfile <x86intrin.h>
670 ///
671 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
672 ///
673 /// \param __a
674 ///    A 128-bit vector of [2 x double].
675 /// \param __b
676 ///    A 128-bit vector of [2 x double].
677 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpnge_pd(__m128d __a,__m128d __b)678 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
679                                                            __m128d __b) {
680   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
681 }
682 
683 /// Compares the lower double-precision floating-point values in each of
684 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
685 ///
686 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
687 ///    If either value in a comparison is NaN, returns false.
688 ///
689 /// \headerfile <x86intrin.h>
690 ///
691 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
692 ///
693 /// \param __a
694 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
695 ///    compared to the lower double-precision value of \a __b.
696 /// \param __b
697 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
698 ///    compared to the lower double-precision value of \a __a.
699 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
700 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpeq_sd(__m128d __a,__m128d __b)701 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
702                                                           __m128d __b) {
703   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
704 }
705 
706 /// Compares the lower double-precision floating-point values in each of
707 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
708 ///    the value in the first parameter is less than the corresponding value in
709 ///    the second parameter.
710 ///
711 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
712 ///    If either value in a comparison is NaN, returns false.
713 ///
714 /// \headerfile <x86intrin.h>
715 ///
716 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
717 ///
718 /// \param __a
719 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
720 ///    compared to the lower double-precision value of \a __b.
721 /// \param __b
722 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
723 ///    compared to the lower double-precision value of \a __a.
724 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
725 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmplt_sd(__m128d __a,__m128d __b)726 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
727                                                           __m128d __b) {
728   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
729 }
730 
731 /// Compares the lower double-precision floating-point values in each of
732 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
733 ///    the value in the first parameter is less than or equal to the
734 ///    corresponding value in the second parameter.
735 ///
736 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
737 ///    If either value in a comparison is NaN, returns false.
738 ///
739 /// \headerfile <x86intrin.h>
740 ///
741 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
742 ///
743 /// \param __a
744 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
745 ///    compared to the lower double-precision value of \a __b.
746 /// \param __b
747 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
748 ///    compared to the lower double-precision value of \a __a.
749 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
750 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmple_sd(__m128d __a,__m128d __b)751 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
752                                                           __m128d __b) {
753   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
754 }
755 
756 /// Compares the lower double-precision floating-point values in each of
757 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
758 ///    the value in the first parameter is greater than the corresponding value
759 ///    in the second parameter.
760 ///
761 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
762 ///    If either value in a comparison is NaN, returns false.
763 ///
764 /// \headerfile <x86intrin.h>
765 ///
766 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
767 ///
768 /// \param __a
769 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
770 ///     compared to the lower double-precision value of \a __b.
771 /// \param __b
772 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
773 ///     compared to the lower double-precision value of \a __a.
774 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
775 ///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpgt_sd(__m128d __a,__m128d __b)776 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
777                                                           __m128d __b) {
778   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
779   return __extension__(__m128d){__c[0], __a[1]};
780 }
781 
782 /// Compares the lower double-precision floating-point values in each of
783 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
784 ///    the value in the first parameter is greater than or equal to the
785 ///    corresponding value in the second parameter.
786 ///
787 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
788 ///    If either value in a comparison is NaN, returns false.
789 ///
790 /// \headerfile <x86intrin.h>
791 ///
792 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
793 ///
794 /// \param __a
795 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
796 ///    compared to the lower double-precision value of \a __b.
797 /// \param __b
798 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
799 ///    compared to the lower double-precision value of \a __a.
800 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
801 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpge_sd(__m128d __a,__m128d __b)802 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
803                                                           __m128d __b) {
804   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
805   return __extension__(__m128d){__c[0], __a[1]};
806 }
807 
808 /// Compares the lower double-precision floating-point values in each of
809 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
810 ///    the value in the first parameter is ordered with respect to the
811 ///    corresponding value in the second parameter.
812 ///
813 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
814 ///    of double-precision values are ordered with respect to each other if
815 ///    neither value is a NaN.
816 ///
817 /// \headerfile <x86intrin.h>
818 ///
819 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
820 ///
821 /// \param __a
822 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
823 ///    compared to the lower double-precision value of \a __b.
824 /// \param __b
825 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
826 ///    compared to the lower double-precision value of \a __a.
827 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
828 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpord_sd(__m128d __a,__m128d __b)829 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
830                                                            __m128d __b) {
831   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
832 }
833 
834 /// Compares the lower double-precision floating-point values in each of
835 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
836 ///    the value in the first parameter is unordered with respect to the
837 ///    corresponding value in the second parameter.
838 ///
839 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
840 ///    of double-precision values are unordered with respect to each other if
841 ///    one or both values are NaN.
842 ///
843 /// \headerfile <x86intrin.h>
844 ///
845 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
846 ///   instruction.
847 ///
848 /// \param __a
849 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
850 ///    compared to the lower double-precision value of \a __b.
851 /// \param __b
852 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
853 ///    compared to the lower double-precision value of \a __a.
854 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
855 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpunord_sd(__m128d __a,__m128d __b)856 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
857                                                              __m128d __b) {
858   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
859 }
860 
861 /// Compares the lower double-precision floating-point values in each of
862 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
863 ///    the value in the first parameter is unequal to the corresponding value in
864 ///    the second parameter.
865 ///
866 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
867 ///    If either value in a comparison is NaN, returns true.
868 ///
869 /// \headerfile <x86intrin.h>
870 ///
871 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
872 ///
873 /// \param __a
874 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
875 ///    compared to the lower double-precision value of \a __b.
876 /// \param __b
877 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
878 ///    compared to the lower double-precision value of \a __a.
879 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
880 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpneq_sd(__m128d __a,__m128d __b)881 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
882                                                            __m128d __b) {
883   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
884 }
885 
886 /// Compares the lower double-precision floating-point values in each of
887 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
888 ///    the value in the first parameter is not less than the corresponding
889 ///    value in the second parameter.
890 ///
891 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
892 ///    If either value in a comparison is NaN, returns true.
893 ///
894 /// \headerfile <x86intrin.h>
895 ///
896 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
897 ///
898 /// \param __a
899 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
900 ///    compared to the lower double-precision value of \a __b.
901 /// \param __b
902 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
903 ///    compared to the lower double-precision value of \a __a.
904 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
905 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpnlt_sd(__m128d __a,__m128d __b)906 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
907                                                            __m128d __b) {
908   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
909 }
910 
911 /// Compares the lower double-precision floating-point values in each of
912 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
913 ///    the value in the first parameter is not less than or equal to the
914 ///    corresponding value in the second parameter.
915 ///
916 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
917 ///    If either value in a comparison is NaN, returns true.
918 ///
919 /// \headerfile <x86intrin.h>
920 ///
921 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
922 ///
923 /// \param __a
924 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
925 ///    compared to the lower double-precision value of \a __b.
926 /// \param __b
927 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
928 ///    compared to the lower double-precision value of \a __a.
929 /// \returns  A 128-bit vector. The lower 64 bits contains the comparison
930 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpnle_sd(__m128d __a,__m128d __b)931 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
932                                                            __m128d __b) {
933   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
934 }
935 
936 /// Compares the lower double-precision floating-point values in each of
937 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
938 ///    the value in the first parameter is not greater than the corresponding
939 ///    value in the second parameter.
940 ///
941 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
942 ///    If either value in a comparison is NaN, returns true.
943 ///
944 /// \headerfile <x86intrin.h>
945 ///
946 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
947 ///
948 /// \param __a
949 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
950 ///    compared to the lower double-precision value of \a __b.
951 /// \param __b
952 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
953 ///    compared to the lower double-precision value of \a __a.
954 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
955 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpngt_sd(__m128d __a,__m128d __b)956 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
957                                                            __m128d __b) {
958   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
959   return __extension__(__m128d){__c[0], __a[1]};
960 }
961 
962 /// Compares the lower double-precision floating-point values in each of
963 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
964 ///    the value in the first parameter is not greater than or equal to the
965 ///    corresponding value in the second parameter.
966 ///
967 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
968 ///    If either value in a comparison is NaN, returns true.
969 ///
970 /// \headerfile <x86intrin.h>
971 ///
972 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
973 ///
974 /// \param __a
975 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
976 ///    compared to the lower double-precision value of \a __b.
977 /// \param __b
978 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
979 ///    compared to the lower double-precision value of \a __a.
980 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
981 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpnge_sd(__m128d __a,__m128d __b)982 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
983                                                            __m128d __b) {
984   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
985   return __extension__(__m128d){__c[0], __a[1]};
986 }
987 
988 /// Compares the lower double-precision floating-point values in each of
989 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
990 ///
991 ///    The comparison returns 0 for false, 1 for true. If either value in a
992 ///    comparison is NaN, returns 0.
993 ///
994 /// \headerfile <x86intrin.h>
995 ///
996 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
997 ///
998 /// \param __a
999 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1000 ///    compared to the lower double-precision value of \a __b.
1001 /// \param __b
1002 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1003 ///    compared to the lower double-precision value of \a __a.
1004 /// \returns An integer containing the comparison results.
_mm_comieq_sd(__m128d __a,__m128d __b)1005 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
1006                                                        __m128d __b) {
1007   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1008 }
1009 
1010 /// Compares the lower double-precision floating-point values in each of
1011 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1012 ///    the value in the first parameter is less than the corresponding value in
1013 ///    the second parameter.
1014 ///
1015 ///    The comparison returns 0 for false, 1 for true. If either value in a
1016 ///    comparison is NaN, returns 0.
1017 ///
1018 /// \headerfile <x86intrin.h>
1019 ///
1020 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1021 ///
1022 /// \param __a
1023 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1024 ///    compared to the lower double-precision value of \a __b.
1025 /// \param __b
1026 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1027 ///    compared to the lower double-precision value of \a __a.
1028 /// \returns An integer containing the comparison results.
_mm_comilt_sd(__m128d __a,__m128d __b)1029 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
1030                                                        __m128d __b) {
1031   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1032 }
1033 
1034 /// Compares the lower double-precision floating-point values in each of
1035 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1036 ///    the value in the first parameter is less than or equal to the
1037 ///    corresponding value in the second parameter.
1038 ///
1039 ///    The comparison returns 0 for false, 1 for true. If either value in a
1040 ///    comparison is NaN, returns 0.
1041 ///
1042 /// \headerfile <x86intrin.h>
1043 ///
1044 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1045 ///
1046 /// \param __a
1047 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1048 ///    compared to the lower double-precision value of \a __b.
1049 /// \param __b
1050 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1051 ///     compared to the lower double-precision value of \a __a.
1052 /// \returns An integer containing the comparison results.
_mm_comile_sd(__m128d __a,__m128d __b)1053 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1054                                                        __m128d __b) {
1055   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1056 }
1057 
1058 /// Compares the lower double-precision floating-point values in each of
1059 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1060 ///    the value in the first parameter is greater than the corresponding value
1061 ///    in the second parameter.
1062 ///
1063 ///    The comparison returns 0 for false, 1 for true. If either value in a
1064 ///    comparison is NaN, returns 0.
1065 ///
1066 /// \headerfile <x86intrin.h>
1067 ///
1068 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1069 ///
1070 /// \param __a
1071 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1072 ///    compared to the lower double-precision value of \a __b.
1073 /// \param __b
1074 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1075 ///    compared to the lower double-precision value of \a __a.
1076 /// \returns An integer containing the comparison results.
_mm_comigt_sd(__m128d __a,__m128d __b)1077 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1078                                                        __m128d __b) {
1079   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1080 }
1081 
1082 /// Compares the lower double-precision floating-point values in each of
1083 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1084 ///    the value in the first parameter is greater than or equal to the
1085 ///    corresponding value in the second parameter.
1086 ///
1087 ///    The comparison returns 0 for false, 1 for true. If either value in a
1088 ///    comparison is NaN, returns 0.
1089 ///
1090 /// \headerfile <x86intrin.h>
1091 ///
1092 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1093 ///
1094 /// \param __a
1095 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1096 ///    compared to the lower double-precision value of \a __b.
1097 /// \param __b
1098 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1099 ///    compared to the lower double-precision value of \a __a.
1100 /// \returns An integer containing the comparison results.
_mm_comige_sd(__m128d __a,__m128d __b)1101 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1102                                                        __m128d __b) {
1103   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1104 }
1105 
1106 /// Compares the lower double-precision floating-point values in each of
1107 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1108 ///    the value in the first parameter is unequal to the corresponding value in
1109 ///    the second parameter.
1110 ///
1111 ///    The comparison returns 0 for false, 1 for true. If either value in a
1112 ///    comparison is NaN, returns 1.
1113 ///
1114 /// \headerfile <x86intrin.h>
1115 ///
1116 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1117 ///
1118 /// \param __a
1119 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1120 ///    compared to the lower double-precision value of \a __b.
1121 /// \param __b
1122 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1123 ///    compared to the lower double-precision value of \a __a.
1124 /// \returns An integer containing the comparison results.
_mm_comineq_sd(__m128d __a,__m128d __b)1125 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1126                                                         __m128d __b) {
1127   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1128 }
1129 
1130 /// Compares the lower double-precision floating-point values in each of
1131 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
1132 ///
1133 ///    The comparison returns 0 for false, 1 for true. If either value in a
1134 ///    comparison is NaN, returns 0.
1135 ///
1136 /// \headerfile <x86intrin.h>
1137 ///
1138 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1139 ///
1140 /// \param __a
1141 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1142 ///    compared to the lower double-precision value of \a __b.
1143 /// \param __b
1144 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1145 ///    compared to the lower double-precision value of \a __a.
1146 /// \returns An integer containing the comparison results.
_mm_ucomieq_sd(__m128d __a,__m128d __b)1147 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1148                                                         __m128d __b) {
1149   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1150 }
1151 
1152 /// Compares the lower double-precision floating-point values in each of
1153 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1154 ///    the value in the first parameter is less than the corresponding value in
1155 ///    the second parameter.
1156 ///
1157 ///    The comparison returns 0 for false, 1 for true. If either value in a
1158 ///    comparison is NaN, returns 0.
1159 ///
1160 /// \headerfile <x86intrin.h>
1161 ///
1162 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1163 ///
1164 /// \param __a
1165 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1166 ///    compared to the lower double-precision value of \a __b.
1167 /// \param __b
1168 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1169 ///    compared to the lower double-precision value of \a __a.
1170 /// \returns An integer containing the comparison results.
_mm_ucomilt_sd(__m128d __a,__m128d __b)1171 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1172                                                         __m128d __b) {
1173   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1174 }
1175 
1176 /// Compares the lower double-precision floating-point values in each of
1177 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1178 ///    the value in the first parameter is less than or equal to the
1179 ///    corresponding value in the second parameter.
1180 ///
1181 ///    The comparison returns 0 for false, 1 for true. If either value in a
1182 ///    comparison is NaN, returns 0.
1183 ///
1184 /// \headerfile <x86intrin.h>
1185 ///
1186 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1187 ///
1188 /// \param __a
1189 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1190 ///    compared to the lower double-precision value of \a __b.
1191 /// \param __b
1192 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1193 ///     compared to the lower double-precision value of \a __a.
1194 /// \returns An integer containing the comparison results.
_mm_ucomile_sd(__m128d __a,__m128d __b)1195 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1196                                                         __m128d __b) {
1197   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1198 }
1199 
1200 /// Compares the lower double-precision floating-point values in each of
1201 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1202 ///    the value in the first parameter is greater than the corresponding value
1203 ///    in the second parameter.
1204 ///
1205 ///    The comparison returns 0 for false, 1 for true. If either value in a
1206 ///    comparison is NaN, returns 0.
1207 ///
1208 /// \headerfile <x86intrin.h>
1209 ///
1210 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1211 ///
1212 /// \param __a
1213 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1214 ///    compared to the lower double-precision value of \a __b.
1215 /// \param __b
1216 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1217 ///     compared to the lower double-precision value of \a __a.
1218 /// \returns An integer containing the comparison results.
_mm_ucomigt_sd(__m128d __a,__m128d __b)1219 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1220                                                         __m128d __b) {
1221   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1222 }
1223 
1224 /// Compares the lower double-precision floating-point values in each of
1225 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1226 ///    the value in the first parameter is greater than or equal to the
1227 ///    corresponding value in the second parameter.
1228 ///
1229 ///    The comparison returns 0 for false, 1 for true. If either value in a
1230 ///    comparison is NaN, returns 0.
1231 ///
1232 /// \headerfile <x86intrin.h>
1233 ///
1234 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1235 ///
1236 /// \param __a
1237 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1238 ///    compared to the lower double-precision value of \a __b.
1239 /// \param __b
1240 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1241 ///    compared to the lower double-precision value of \a __a.
1242 /// \returns An integer containing the comparison results.
_mm_ucomige_sd(__m128d __a,__m128d __b)1243 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1244                                                         __m128d __b) {
1245   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1246 }
1247 
1248 /// Compares the lower double-precision floating-point values in each of
1249 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1250 ///    the value in the first parameter is unequal to the corresponding value in
1251 ///    the second parameter.
1252 ///
1253 ///    The comparison returns 0 for false, 1 for true. If either value in a
1254 ///    comparison is NaN, returns 1.
1255 ///
1256 /// \headerfile <x86intrin.h>
1257 ///
1258 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1259 ///
1260 /// \param __a
1261 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1262 ///    compared to the lower double-precision value of \a __b.
1263 /// \param __b
1264 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1265 ///    compared to the lower double-precision value of \a __a.
1266 /// \returns An integer containing the comparison result.
_mm_ucomineq_sd(__m128d __a,__m128d __b)1267 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1268                                                          __m128d __b) {
1269   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1270 }
1271 
1272 /// Converts the two double-precision floating-point elements of a
1273 ///    128-bit vector of [2 x double] into two single-precision floating-point
1274 ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1275 ///    The upper 64 bits of the result vector are set to zero.
1276 ///
1277 /// \headerfile <x86intrin.h>
1278 ///
1279 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1280 ///
1281 /// \param __a
1282 ///    A 128-bit vector of [2 x double].
1283 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1284 ///    converted values. The upper 64 bits are set to zero.
_mm_cvtpd_ps(__m128d __a)1285 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1286   return __builtin_ia32_cvtpd2ps((__v2df)__a);
1287 }
1288 
1289 /// Converts the lower two single-precision floating-point elements of a
1290 ///    128-bit vector of [4 x float] into two double-precision floating-point
1291 ///    values, returned in a 128-bit vector of [2 x double]. The upper two
1292 ///    elements of the input vector are unused.
1293 ///
1294 /// \headerfile <x86intrin.h>
1295 ///
1296 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1297 ///
1298 /// \param __a
1299 ///    A 128-bit vector of [4 x float]. The lower two single-precision
1300 ///    floating-point elements are converted to double-precision values. The
1301 ///    upper two elements are unused.
1302 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtps_pd(__m128 __a)1303 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1304   return (__m128d) __builtin_convertvector(
1305       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1306 }
1307 
1308 /// Converts the lower two integer elements of a 128-bit vector of
1309 ///    [4 x i32] into two double-precision floating-point values, returned in a
1310 ///    128-bit vector of [2 x double].
1311 ///
1312 ///    The upper two elements of the input vector are unused.
1313 ///
1314 /// \headerfile <x86intrin.h>
1315 ///
1316 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1317 ///
1318 /// \param __a
1319 ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1320 ///    converted to double-precision values.
1321 ///
1322 ///    The upper two elements are unused.
1323 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtepi32_pd(__m128i __a)1324 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1325   return (__m128d) __builtin_convertvector(
1326       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1327 }
1328 
1329 /// Converts the two double-precision floating-point elements of a
1330 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1331 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1332 ///    64 bits of the result vector are set to zero.
1333 ///
1334 ///    If a converted value does not fit in a 32-bit integer, raises a
1335 ///    floating-point invalid exception. If the exception is masked, returns
1336 ///    the most negative integer.
1337 ///
1338 /// \headerfile <x86intrin.h>
1339 ///
1340 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1341 ///
1342 /// \param __a
1343 ///    A 128-bit vector of [2 x double].
1344 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1345 ///    converted values. The upper 64 bits are set to zero.
_mm_cvtpd_epi32(__m128d __a)1346 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1347   return __builtin_ia32_cvtpd2dq((__v2df)__a);
1348 }
1349 
1350 /// Converts the low-order element of a 128-bit vector of [2 x double]
1351 ///    into a 32-bit signed integer value.
1352 ///
1353 ///    If the converted value does not fit in a 32-bit integer, raises a
1354 ///    floating-point invalid exception. If the exception is masked, returns
1355 ///    the most negative integer.
1356 ///
1357 /// \headerfile <x86intrin.h>
1358 ///
1359 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1360 ///
1361 /// \param __a
1362 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1363 ///    conversion.
1364 /// \returns A 32-bit signed integer containing the converted value.
_mm_cvtsd_si32(__m128d __a)1365 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1366   return __builtin_ia32_cvtsd2si((__v2df)__a);
1367 }
1368 
1369 /// Converts the lower double-precision floating-point element of a
1370 ///    128-bit vector of [2 x double], in the second parameter, into a
1371 ///    single-precision floating-point value, returned in the lower 32 bits of a
1372 ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1373 ///    copied from the upper 96 bits of the first parameter.
1374 ///
1375 /// \headerfile <x86intrin.h>
1376 ///
1377 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1378 ///
1379 /// \param __a
1380 ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1381 ///    copied to the upper 96 bits of the result.
1382 /// \param __b
1383 ///    A 128-bit vector of [2 x double]. The lower double-precision
1384 ///    floating-point element is used in the conversion.
1385 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1386 ///    converted value from the second parameter. The upper 96 bits are copied
1387 ///    from the upper 96 bits of the first parameter.
_mm_cvtsd_ss(__m128 __a,__m128d __b)1388 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1389                                                          __m128d __b) {
1390   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1391 }
1392 
1393 /// Converts a 32-bit signed integer value, in the second parameter, into
1394 ///    a double-precision floating-point value, returned in the lower 64 bits of
1395 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1396 ///    are copied from the upper 64 bits of the first parameter.
1397 ///
1398 /// \headerfile <x86intrin.h>
1399 ///
1400 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1401 ///
1402 /// \param __a
1403 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1404 ///    copied to the upper 64 bits of the result.
1405 /// \param __b
1406 ///    A 32-bit signed integer containing the value to be converted.
1407 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1408 ///    converted value from the second parameter. The upper 64 bits are copied
1409 ///    from the upper 64 bits of the first parameter.
_mm_cvtsi32_sd(__m128d __a,int __b)1410 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1411                                                             int __b) {
1412   __a[0] = __b;
1413   return __a;
1414 }
1415 
1416 /// Converts the lower single-precision floating-point element of a
1417 ///    128-bit vector of [4 x float], in the second parameter, into a
1418 ///    double-precision floating-point value, returned in the lower 64 bits of
1419 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1420 ///    are copied from the upper 64 bits of the first parameter.
1421 ///
1422 /// \headerfile <x86intrin.h>
1423 ///
1424 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1425 ///
1426 /// \param __a
1427 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1428 ///    copied to the upper 64 bits of the result.
1429 /// \param __b
1430 ///    A 128-bit vector of [4 x float]. The lower single-precision
1431 ///    floating-point element is used in the conversion.
1432 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1433 ///    converted value from the second parameter. The upper 64 bits are copied
1434 ///    from the upper 64 bits of the first parameter.
_mm_cvtss_sd(__m128d __a,__m128 __b)1435 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1436                                                           __m128 __b) {
1437   __a[0] = __b[0];
1438   return __a;
1439 }
1440 
1441 /// Converts the two double-precision floating-point elements of a
1442 ///    128-bit vector of [2 x double] into two signed truncated (rounded
1443 ///    toward zero) 32-bit integer values, returned in the lower 64 bits
1444 ///    of a 128-bit vector of [4 x i32].
1445 ///
1446 ///    If a converted value does not fit in a 32-bit integer, raises a
1447 ///    floating-point invalid exception. If the exception is masked, returns
1448 ///    the most negative integer.
1449 ///
1450 /// \headerfile <x86intrin.h>
1451 ///
1452 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1453 ///   instruction.
1454 ///
1455 /// \param __a
1456 ///    A 128-bit vector of [2 x double].
1457 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1458 ///    converted values. The upper 64 bits are set to zero.
_mm_cvttpd_epi32(__m128d __a)1459 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1460   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1461 }
1462 
1463 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1464 ///    signed truncated (rounded toward zero) integer value.
1465 ///
1466 ///    If the converted value does not fit in a 32-bit integer, raises a
1467 ///    floating-point invalid exception. If the exception is masked, returns
1468 ///    the most negative integer.
1469 ///
1470 /// \headerfile <x86intrin.h>
1471 ///
1472 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1473 ///   instruction.
1474 ///
1475 /// \param __a
1476 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1477 ///    conversion.
1478 /// \returns A 32-bit signed integer containing the converted value.
_mm_cvttsd_si32(__m128d __a)1479 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1480   return __builtin_ia32_cvttsd2si((__v2df)__a);
1481 }
1482 
1483 /// Converts the two double-precision floating-point elements of a
1484 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1485 ///    returned in a 64-bit vector of [2 x i32].
1486 ///
1487 ///    If a converted value does not fit in a 32-bit integer, raises a
1488 ///    floating-point invalid exception. If the exception is masked, returns
1489 ///    the most negative integer.
1490 ///
1491 /// \headerfile <x86intrin.h>
1492 ///
1493 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1494 ///
1495 /// \param __a
1496 ///    A 128-bit vector of [2 x double].
1497 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
_mm_cvtpd_pi32(__m128d __a)1498 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
1499   return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
1500 }
1501 
1502 /// Converts the two double-precision floating-point elements of a
1503 ///    128-bit vector of [2 x double] into two signed truncated (rounded toward
1504 ///    zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1505 ///
1506 ///    If a converted value does not fit in a 32-bit integer, raises a
1507 ///    floating-point invalid exception. If the exception is masked, returns
1508 ///    the most negative integer.
1509 ///
1510 /// \headerfile <x86intrin.h>
1511 ///
1512 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1513 ///
1514 /// \param __a
1515 ///    A 128-bit vector of [2 x double].
1516 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
_mm_cvttpd_pi32(__m128d __a)1517 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
1518   return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
1519 }
1520 
1521 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1522 ///    [2 x i32] into two double-precision floating-point values, returned in a
1523 ///    128-bit vector of [2 x double].
1524 ///
1525 /// \headerfile <x86intrin.h>
1526 ///
1527 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1528 ///
1529 /// \param __a
1530 ///    A 64-bit vector of [2 x i32].
1531 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtpi32_pd(__m64 __a)1532 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) {
1533   return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
1534 }
1535 
1536 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1537 ///    a double-precision floating-point value.
1538 ///
1539 /// \headerfile <x86intrin.h>
1540 ///
1541 /// This intrinsic has no corresponding instruction.
1542 ///
1543 /// \param __a
1544 ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1545 /// \returns A double-precision floating-point value copied from the lower 64
1546 ///    bits of \a __a.
_mm_cvtsd_f64(__m128d __a)1547 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1548   return __a[0];
1549 }
1550 
1551 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1552 ///    memory location.
1553 ///
1554 /// \headerfile <x86intrin.h>
1555 ///
1556 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1557 ///
1558 /// \param __dp
1559 ///    A pointer to a 128-bit memory location. The address of the memory
1560 ///    location has to be 16-byte aligned.
1561 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
_mm_load_pd(double const * __dp)1562 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1563   return *(const __m128d *)__dp;
1564 }
1565 
1566 /// Loads a double-precision floating-point value from a specified memory
1567 ///    location and duplicates it to both vector elements of a 128-bit vector of
1568 ///    [2 x double].
1569 ///
1570 /// \headerfile <x86intrin.h>
1571 ///
1572 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1573 ///
1574 /// \param __dp
1575 ///    A pointer to a memory location containing a double-precision value.
1576 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1577 ///    duplicated values.
_mm_load1_pd(double const * __dp)1578 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1579   struct __mm_load1_pd_struct {
1580     double __u;
1581   } __attribute__((__packed__, __may_alias__));
1582   double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1583   return __extension__(__m128d){__u, __u};
1584 }
1585 
/// Alternate spelling of _mm_load1_pd(); expands to a call of that function
/// with the same argument.
#define _mm_load_pd1(dp) _mm_load1_pd((dp))
1587 
1588 /// Loads two double-precision values, in reverse order, from an aligned
1589 ///    memory location into a 128-bit vector of [2 x double].
1590 ///
1591 /// \headerfile <x86intrin.h>
1592 ///
1593 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1594 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1595 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1596 ///
1597 /// \param __dp
1598 ///    A 16-byte aligned pointer to an array of double-precision values to be
1599 ///    loaded in reverse order.
1600 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1601 ///    values.
_mm_loadr_pd(double const * __dp)1602 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1603   __m128d __u = *(const __m128d *)__dp;
1604   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1605 }
1606 
1607 /// Loads a 128-bit floating-point vector of [2 x double] from an
1608 ///    unaligned memory location.
1609 ///
1610 /// \headerfile <x86intrin.h>
1611 ///
1612 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1613 ///
1614 /// \param __dp
1615 ///    A pointer to a 128-bit memory location. The address of the memory
1616 ///    location does not have to be aligned.
1617 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
_mm_loadu_pd(double const * __dp)1618 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1619   struct __loadu_pd {
1620     __m128d_u __v;
1621   } __attribute__((__packed__, __may_alias__));
1622   return ((const struct __loadu_pd *)__dp)->__v;
1623 }
1624 
1625 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1626 ///    vector and clears the upper element.
1627 ///
1628 /// \headerfile <x86intrin.h>
1629 ///
1630 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1631 ///
1632 /// \param __a
1633 ///    A pointer to a 64-bit memory location. The address of the memory
1634 ///    location does not have to be aligned.
1635 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
_mm_loadu_si64(void const * __a)1636 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1637   struct __loadu_si64 {
1638     long long __v;
1639   } __attribute__((__packed__, __may_alias__));
1640   long long __u = ((const struct __loadu_si64 *)__a)->__v;
1641   return __extension__(__m128i)(__v2di){__u, 0LL};
1642 }
1643 
1644 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1645 ///    vector and clears the upper element.
1646 ///
1647 /// \headerfile <x86intrin.h>
1648 ///
1649 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1650 ///
1651 /// \param __a
1652 ///    A pointer to a 32-bit memory location. The address of the memory
1653 ///    location does not have to be aligned.
1654 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
_mm_loadu_si32(void const * __a)1655 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1656   struct __loadu_si32 {
1657     int __v;
1658   } __attribute__((__packed__, __may_alias__));
1659   int __u = ((const struct __loadu_si32 *)__a)->__v;
1660   return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1661 }
1662 
1663 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1664 ///    vector and clears the upper element.
1665 ///
1666 /// \headerfile <x86intrin.h>
1667 ///
1668 /// This intrinsic does not correspond to a specific instruction.
1669 ///
1670 /// \param __a
1671 ///    A pointer to a 16-bit memory location. The address of the memory
1672 ///    location does not have to be aligned.
1673 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
_mm_loadu_si16(void const * __a)1674 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1675   struct __loadu_si16 {
1676     short __v;
1677   } __attribute__((__packed__, __may_alias__));
1678   short __u = ((const struct __loadu_si16 *)__a)->__v;
1679   return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1680 }
1681 
1682 /// Loads a 64-bit double-precision value to the low element of a
1683 ///    128-bit integer vector and clears the upper element.
1684 ///
1685 /// \headerfile <x86intrin.h>
1686 ///
1687 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1688 ///
1689 /// \param __dp
1690 ///    A pointer to a memory location containing a double-precision value.
1691 ///    The address of the memory location does not have to be aligned.
1692 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
_mm_load_sd(double const * __dp)1693 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1694   struct __mm_load_sd_struct {
1695     double __u;
1696   } __attribute__((__packed__, __may_alias__));
1697   double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1698   return __extension__(__m128d){__u, 0};
1699 }
1700 
1701 /// Loads a double-precision value into the high-order bits of a 128-bit
1702 ///    vector of [2 x double]. The low-order bits are copied from the low-order
1703 ///    bits of the first operand.
1704 ///
1705 /// \headerfile <x86intrin.h>
1706 ///
1707 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1708 ///
1709 /// \param __a
1710 ///    A 128-bit vector of [2 x double]. \n
1711 ///    Bits [63:0] are written to bits [63:0] of the result.
1712 /// \param __dp
1713 ///    A pointer to a 64-bit memory location containing a double-precision
1714 ///    floating-point value that is loaded. The loaded value is written to bits
1715 ///    [127:64] of the result. The address of the memory location does not have
1716 ///    to be aligned.
1717 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_loadh_pd(__m128d __a,double const * __dp)1718 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1719                                                           double const *__dp) {
1720   struct __mm_loadh_pd_struct {
1721     double __u;
1722   } __attribute__((__packed__, __may_alias__));
1723   double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1724   return __extension__(__m128d){__a[0], __u};
1725 }
1726 
1727 /// Loads a double-precision value into the low-order bits of a 128-bit
1728 ///    vector of [2 x double]. The high-order bits are copied from the
1729 ///    high-order bits of the first operand.
1730 ///
1731 /// \headerfile <x86intrin.h>
1732 ///
1733 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1734 ///
1735 /// \param __a
1736 ///    A 128-bit vector of [2 x double]. \n
1737 ///    Bits [127:64] are written to bits [127:64] of the result.
1738 /// \param __dp
1739 ///    A pointer to a 64-bit memory location containing a double-precision
1740 ///    floating-point value that is loaded. The loaded value is written to bits
1741 ///    [63:0] of the result. The address of the memory location does not have to
1742 ///    be aligned.
1743 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_loadl_pd(__m128d __a,double const * __dp)1744 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1745                                                           double const *__dp) {
1746   struct __mm_loadl_pd_struct {
1747     double __u;
1748   } __attribute__((__packed__, __may_alias__));
1749   double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1750   return __extension__(__m128d){__u, __a[1]};
1751 }
1752 
1753 /// Constructs a 128-bit floating-point vector of [2 x double] with
1754 ///    unspecified content. This could be used as an argument to another
1755 ///    intrinsic function where the argument is required but the value is not
1756 ///    actually used.
1757 ///
1758 /// \headerfile <x86intrin.h>
1759 ///
1760 /// This intrinsic has no corresponding instruction.
1761 ///
1762 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1763 ///    content.
_mm_undefined_pd(void)1764 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1765   return (__m128d)__builtin_ia32_undef128();
1766 }
1767 
1768 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1769 ///    64 bits of the vector are initialized with the specified double-precision
1770 ///    floating-point value. The upper 64 bits are set to zero.
1771 ///
1772 /// \headerfile <x86intrin.h>
1773 ///
1774 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1775 ///
1776 /// \param __w
1777 ///    A double-precision floating-point value used to initialize the lower 64
1778 ///    bits of the result.
1779 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1780 ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1781 ///    set to zero.
_mm_set_sd(double __w)1782 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1783   return __extension__(__m128d){__w, 0.0};
1784 }
1785 
1786 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1787 ///    of the two double-precision floating-point vector elements set to the
1788 ///    specified double-precision floating-point value.
1789 ///
1790 /// \headerfile <x86intrin.h>
1791 ///
1792 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1793 ///
1794 /// \param __w
1795 ///    A double-precision floating-point value used to initialize each vector
1796 ///    element of the result.
1797 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set1_pd(double __w)1798 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1799   return __extension__(__m128d){__w, __w};
1800 }
1801 
1802 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1803 ///    of the two double-precision floating-point vector elements set to the
1804 ///    specified double-precision floating-point value.
1805 ///
1806 /// \headerfile <x86intrin.h>
1807 ///
1808 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1809 ///
1810 /// \param __w
1811 ///    A double-precision floating-point value used to initialize each vector
1812 ///    element of the result.
1813 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set_pd1(double __w)1814 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1815   return _mm_set1_pd(__w);
1816 }
1817 
1818 /// Constructs a 128-bit floating-point vector of [2 x double]
1819 ///    initialized with the specified double-precision floating-point values.
1820 ///
1821 /// \headerfile <x86intrin.h>
1822 ///
1823 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1824 ///
1825 /// \param __w
1826 ///    A double-precision floating-point value used to initialize the upper 64
1827 ///    bits of the result.
1828 /// \param __x
1829 ///    A double-precision floating-point value used to initialize the lower 64
1830 ///    bits of the result.
1831 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set_pd(double __w,double __x)1832 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1833                                                         double __x) {
1834   return __extension__(__m128d){__x, __w};
1835 }
1836 
1837 /// Constructs a 128-bit floating-point vector of [2 x double],
1838 ///    initialized in reverse order with the specified double-precision
1839 ///    floating-point values.
1840 ///
1841 /// \headerfile <x86intrin.h>
1842 ///
1843 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1844 ///
1845 /// \param __w
1846 ///    A double-precision floating-point value used to initialize the lower 64
1847 ///    bits of the result.
1848 /// \param __x
1849 ///    A double-precision floating-point value used to initialize the upper 64
1850 ///    bits of the result.
1851 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_setr_pd(double __w,double __x)1852 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1853                                                          double __x) {
1854   return __extension__(__m128d){__w, __x};
1855 }
1856 
1857 /// Constructs a 128-bit floating-point vector of [2 x double]
1858 ///    initialized to zero.
1859 ///
1860 /// \headerfile <x86intrin.h>
1861 ///
1862 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1863 ///
1864 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1865 ///    all elements set to zero.
_mm_setzero_pd(void)1866 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1867   return __extension__(__m128d){0.0, 0.0};
1868 }
1869 
1870 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1871 ///    64 bits are set to the lower 64 bits of the second parameter. The upper
1872 ///    64 bits are set to the upper 64 bits of the first parameter.
1873 ///
1874 /// \headerfile <x86intrin.h>
1875 ///
1876 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1877 ///
1878 /// \param __a
1879 ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1880 ///    upper 64 bits of the result.
1881 /// \param __b
1882 ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1883 ///    lower 64 bits of the result.
1884 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_move_sd(__m128d __a,__m128d __b)1885 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1886                                                          __m128d __b) {
1887   __a[0] = __b[0];
1888   return __a;
1889 }
1890 
1891 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1892 ///    memory location.
1893 ///
1894 /// \headerfile <x86intrin.h>
1895 ///
1896 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1897 ///
1898 /// \param __dp
1899 ///    A pointer to a 64-bit memory location.
1900 /// \param __a
1901 ///    A 128-bit vector of [2 x double] containing the value to be stored.
_mm_store_sd(double * __dp,__m128d __a)1902 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1903                                                        __m128d __a) {
1904   struct __mm_store_sd_struct {
1905     double __u;
1906   } __attribute__((__packed__, __may_alias__));
1907   ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1908 }
1909 
1910 /// Moves packed double-precision values from a 128-bit vector of
1911 ///    [2 x double] to a memory location.
1912 ///
1913 /// \headerfile <x86intrin.h>
1914 ///
1915 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1916 ///
1917 /// \param __dp
1918 ///    A pointer to an aligned memory location that can store two
1919 ///    double-precision values.
1920 /// \param __a
1921 ///    A packed 128-bit vector of [2 x double] containing the values to be
1922 ///    moved.
_mm_store_pd(double * __dp,__m128d __a)1923 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1924                                                        __m128d __a) {
1925   *(__m128d *)__dp = __a;
1926 }
1927 
1928 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1929 ///    the upper and lower 64 bits of a memory location.
1930 ///
1931 /// \headerfile <x86intrin.h>
1932 ///
1933 /// This intrinsic corresponds to the
1934 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1935 ///
1936 /// \param __dp
1937 ///    A pointer to a memory location that can store two double-precision
1938 ///    values.
1939 /// \param __a
1940 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1941 ///    of the values in \a __dp.
_mm_store1_pd(double * __dp,__m128d __a)1942 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1943                                                         __m128d __a) {
1944   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1945   _mm_store_pd(__dp, __a);
1946 }
1947 
1948 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1949 ///    the upper and lower 64 bits of a memory location.
1950 ///
1951 /// \headerfile <x86intrin.h>
1952 ///
1953 /// This intrinsic corresponds to the
1954 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1955 ///
1956 /// \param __dp
1957 ///    A pointer to a memory location that can store two double-precision
1958 ///    values.
1959 /// \param __a
1960 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1961 ///    of the values in \a __dp.
_mm_store_pd1(double * __dp,__m128d __a)1962 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1963                                                         __m128d __a) {
1964   _mm_store1_pd(__dp, __a);
1965 }
1966 
1967 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1968 ///    location.
1969 ///
1970 /// \headerfile <x86intrin.h>
1971 ///
1972 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1973 ///
1974 /// \param __dp
1975 ///    A pointer to a 128-bit memory location. The address of the memory
1976 ///    location does not have to be aligned.
1977 /// \param __a
1978 ///    A 128-bit vector of [2 x double] containing the values to be stored.
_mm_storeu_pd(double * __dp,__m128d __a)1979 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1980                                                         __m128d __a) {
1981   struct __storeu_pd {
1982     __m128d_u __v;
1983   } __attribute__((__packed__, __may_alias__));
1984   ((struct __storeu_pd *)__dp)->__v = __a;
1985 }
1986 
1987 /// Stores two double-precision values, in reverse order, from a 128-bit
1988 ///    vector of [2 x double] to a 16-byte aligned memory location.
1989 ///
1990 /// \headerfile <x86intrin.h>
1991 ///
1992 /// This intrinsic corresponds to a shuffling instruction followed by a
1993 /// <c> VMOVAPD / MOVAPD </c> instruction.
1994 ///
1995 /// \param __dp
1996 ///    A pointer to a 16-byte aligned memory location that can store two
1997 ///    double-precision values.
1998 /// \param __a
1999 ///    A 128-bit vector of [2 x double] containing the values to be reversed and
2000 ///    stored.
_mm_storer_pd(double * __dp,__m128d __a)2001 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
2002                                                         __m128d __a) {
2003   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2004   *(__m128d *)__dp = __a;
2005 }
2006 
2007 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2008 ///    memory location.
2009 ///
2010 /// \headerfile <x86intrin.h>
2011 ///
2012 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2013 ///
2014 /// \param __dp
2015 ///    A pointer to a 64-bit memory location.
2016 /// \param __a
2017 ///    A 128-bit vector of [2 x double] containing the value to be stored.
_mm_storeh_pd(double * __dp,__m128d __a)2018 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2019                                                         __m128d __a) {
2020   struct __mm_storeh_pd_struct {
2021     double __u;
2022   } __attribute__((__packed__, __may_alias__));
2023   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2024 }
2025 
2026 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2027 ///    memory location.
2028 ///
2029 /// \headerfile <x86intrin.h>
2030 ///
2031 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2032 ///
2033 /// \param __dp
2034 ///    A pointer to a 64-bit memory location.
2035 /// \param __a
2036 ///    A 128-bit vector of [2 x double] containing the value to be stored.
_mm_storel_pd(double * __dp,__m128d __a)2037 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2038                                                         __m128d __a) {
2039   struct __mm_storeh_pd_struct {
2040     double __u;
2041   } __attribute__((__packed__, __may_alias__));
2042   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2043 }
2044 
2045 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2046 ///    saving the lower 8 bits of each sum in the corresponding element of a
2047 ///    128-bit result vector of [16 x i8].
2048 ///
2049 ///    The integer elements of both parameters can be either signed or unsigned.
2050 ///
2051 /// \headerfile <x86intrin.h>
2052 ///
2053 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2054 ///
2055 /// \param __a
2056 ///    A 128-bit vector of [16 x i8].
2057 /// \param __b
2058 ///    A 128-bit vector of [16 x i8].
2059 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2060 ///    parameters.
_mm_add_epi8(__m128i __a,__m128i __b)2061 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2062                                                           __m128i __b) {
2063   return (__m128i)((__v16qu)__a + (__v16qu)__b);
2064 }
2065 
2066 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2067 ///    saving the lower 16 bits of each sum in the corresponding element of a
2068 ///    128-bit result vector of [8 x i16].
2069 ///
2070 ///    The integer elements of both parameters can be either signed or unsigned.
2071 ///
2072 /// \headerfile <x86intrin.h>
2073 ///
2074 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2075 ///
2076 /// \param __a
2077 ///    A 128-bit vector of [8 x i16].
2078 /// \param __b
2079 ///    A 128-bit vector of [8 x i16].
2080 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2081 ///    parameters.
_mm_add_epi16(__m128i __a,__m128i __b)2082 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2083                                                            __m128i __b) {
2084   return (__m128i)((__v8hu)__a + (__v8hu)__b);
2085 }
2086 
2087 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2088 ///    saving the lower 32 bits of each sum in the corresponding element of a
2089 ///    128-bit result vector of [4 x i32].
2090 ///
2091 ///    The integer elements of both parameters can be either signed or unsigned.
2092 ///
2093 /// \headerfile <x86intrin.h>
2094 ///
2095 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2096 ///
2097 /// \param __a
2098 ///    A 128-bit vector of [4 x i32].
2099 /// \param __b
2100 ///    A 128-bit vector of [4 x i32].
2101 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2102 ///    parameters.
_mm_add_epi32(__m128i __a,__m128i __b)2103 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2104                                                            __m128i __b) {
2105   return (__m128i)((__v4su)__a + (__v4su)__b);
2106 }
2107 
2108 /// Adds two signed or unsigned 64-bit integer values, returning the
2109 ///    lower 64 bits of the sum.
2110 ///
2111 /// \headerfile <x86intrin.h>
2112 ///
2113 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2114 ///
2115 /// \param __a
2116 ///    A 64-bit integer.
2117 /// \param __b
2118 ///    A 64-bit integer.
2119 /// \returns A 64-bit integer containing the sum of both parameters.
_mm_add_si64(__m64 __a,__m64 __b)2120 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
2121   return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
2122 }
2123 
2124 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2125 ///    saving the lower 64 bits of each sum in the corresponding element of a
2126 ///    128-bit result vector of [2 x i64].
2127 ///
2128 ///    The integer elements of both parameters can be either signed or unsigned.
2129 ///
2130 /// \headerfile <x86intrin.h>
2131 ///
2132 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2133 ///
2134 /// \param __a
2135 ///    A 128-bit vector of [2 x i64].
2136 /// \param __b
2137 ///    A 128-bit vector of [2 x i64].
2138 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2139 ///    parameters.
_mm_add_epi64(__m128i __a,__m128i __b)2140 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2141                                                            __m128i __b) {
2142   return (__m128i)((__v2du)__a + (__v2du)__b);
2143 }
2144 
2145 /// Adds, with saturation, the corresponding elements of two 128-bit
2146 ///    signed [16 x i8] vectors, saving each sum in the corresponding element
2147 ///    of a 128-bit result vector of [16 x i8].
2148 ///
2149 ///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2150 ///    less than 0x80 are saturated to 0x80.
2151 ///
2152 /// \headerfile <x86intrin.h>
2153 ///
2154 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2155 ///
2156 /// \param __a
2157 ///    A 128-bit signed [16 x i8] vector.
2158 /// \param __b
2159 ///    A 128-bit signed [16 x i8] vector.
2160 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2161 ///    both parameters.
_mm_adds_epi8(__m128i __a,__m128i __b)2162 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2163                                                            __m128i __b) {
2164   return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2165 }
2166 
2167 /// Adds, with saturation, the corresponding elements of two 128-bit
2168 ///    signed [8 x i16] vectors, saving each sum in the corresponding element
2169 ///    of a 128-bit result vector of [8 x i16].
2170 ///
2171 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2172 ///    less than 0x8000 are saturated to 0x8000.
2173 ///
2174 /// \headerfile <x86intrin.h>
2175 ///
2176 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2177 ///
2178 /// \param __a
2179 ///    A 128-bit signed [8 x i16] vector.
2180 /// \param __b
2181 ///    A 128-bit signed [8 x i16] vector.
2182 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2183 ///    both parameters.
_mm_adds_epi16(__m128i __a,__m128i __b)2184 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2185                                                             __m128i __b) {
2186   return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2187 }
2188 
2189 /// Adds, with saturation, the corresponding elements of two 128-bit
2190 ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2191 ///    of a 128-bit result vector of [16 x i8].
2192 ///
///    Sums greater than 0xFF are saturated to 0xFF. Because both operands are
///    unsigned, a sum can never be negative.
2195 ///
2196 /// \headerfile <x86intrin.h>
2197 ///
2198 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2199 ///
2200 /// \param __a
2201 ///    A 128-bit unsigned [16 x i8] vector.
2202 /// \param __b
2203 ///    A 128-bit unsigned [16 x i8] vector.
2204 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2205 ///    of both parameters.
_mm_adds_epu8(__m128i __a,__m128i __b)2206 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2207                                                            __m128i __b) {
2208   return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2209 }
2210 
2211 /// Adds, with saturation, the corresponding elements of two 128-bit
2212 ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2213 ///    of a 128-bit result vector of [8 x i16].
2214 ///
///    Sums greater than 0xFFFF are saturated to 0xFFFF. Because both operands
///    are unsigned, a sum can never be negative.
2217 ///
2218 /// \headerfile <x86intrin.h>
2219 ///
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2221 ///
2222 /// \param __a
2223 ///    A 128-bit unsigned [8 x i16] vector.
2224 /// \param __b
2225 ///    A 128-bit unsigned [8 x i16] vector.
2226 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2227 ///    of both parameters.
_mm_adds_epu16(__m128i __a,__m128i __b)2228 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2229                                                             __m128i __b) {
2230   return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2231 }
2232 
2233 /// Computes the rounded averages of corresponding elements of two
2234 ///    128-bit unsigned [16 x i8] vectors, saving each result in the
2235 ///    corresponding element of a 128-bit result vector of [16 x i8].
2236 ///
2237 /// \headerfile <x86intrin.h>
2238 ///
2239 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2240 ///
2241 /// \param __a
2242 ///    A 128-bit unsigned [16 x i8] vector.
2243 /// \param __b
2244 ///    A 128-bit unsigned [16 x i8] vector.
2245 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2246 ///    averages of both parameters.
_mm_avg_epu8(__m128i __a,__m128i __b)2247 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2248                                                           __m128i __b) {
2249   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2250 }
2251 
2252 /// Computes the rounded averages of corresponding elements of two
2253 ///    128-bit unsigned [8 x i16] vectors, saving each result in the
2254 ///    corresponding element of a 128-bit result vector of [8 x i16].
2255 ///
2256 /// \headerfile <x86intrin.h>
2257 ///
2258 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2259 ///
2260 /// \param __a
2261 ///    A 128-bit unsigned [8 x i16] vector.
2262 /// \param __b
2263 ///    A 128-bit unsigned [8 x i16] vector.
2264 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2265 ///    averages of both parameters.
_mm_avg_epu16(__m128i __a,__m128i __b)2266 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2267                                                            __m128i __b) {
2268   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2269 }
2270 
2271 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2272 ///    vectors, producing eight intermediate 32-bit signed integer products, and
2273 ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2274 ///    [4 x i32] vector.
2275 ///
2276 ///    For example, bits [15:0] of both parameters are multiplied producing a
2277 ///    32-bit product, bits [31:16] of both parameters are multiplied producing
2278 ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2279 ///    of the result.
2280 ///
2281 /// \headerfile <x86intrin.h>
2282 ///
2283 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2284 ///
2285 /// \param __a
2286 ///    A 128-bit signed [8 x i16] vector.
2287 /// \param __b
2288 ///    A 128-bit signed [8 x i16] vector.
2289 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2290 ///    of both parameters.
_mm_madd_epi16(__m128i __a,__m128i __b)2291 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2292                                                             __m128i __b) {
2293   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2294 }
2295 
2296 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2297 ///    vectors, saving the greater value from each comparison in the
2298 ///    corresponding element of a 128-bit result vector of [8 x i16].
2299 ///
2300 /// \headerfile <x86intrin.h>
2301 ///
2302 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2303 ///
2304 /// \param __a
2305 ///    A 128-bit signed [8 x i16] vector.
2306 /// \param __b
2307 ///    A 128-bit signed [8 x i16] vector.
2308 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2309 ///    each comparison.
_mm_max_epi16(__m128i __a,__m128i __b)2310 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2311                                                            __m128i __b) {
2312   return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2313 }
2314 
2315 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2316 ///    vectors, saving the greater value from each comparison in the
2317 ///    corresponding element of a 128-bit result vector of [16 x i8].
2318 ///
2319 /// \headerfile <x86intrin.h>
2320 ///
2321 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2322 ///
2323 /// \param __a
2324 ///    A 128-bit unsigned [16 x i8] vector.
2325 /// \param __b
2326 ///    A 128-bit unsigned [16 x i8] vector.
2327 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2328 ///    each comparison.
_mm_max_epu8(__m128i __a,__m128i __b)2329 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2330                                                           __m128i __b) {
2331   return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2332 }
2333 
2334 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2335 ///    vectors, saving the smaller value from each comparison in the
2336 ///    corresponding element of a 128-bit result vector of [8 x i16].
2337 ///
2338 /// \headerfile <x86intrin.h>
2339 ///
2340 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2341 ///
2342 /// \param __a
2343 ///    A 128-bit signed [8 x i16] vector.
2344 /// \param __b
2345 ///    A 128-bit signed [8 x i16] vector.
2346 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2347 ///    each comparison.
_mm_min_epi16(__m128i __a,__m128i __b)2348 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2349                                                            __m128i __b) {
2350   return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2351 }
2352 
2353 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2354 ///    vectors, saving the smaller value from each comparison in the
2355 ///    corresponding element of a 128-bit result vector of [16 x i8].
2356 ///
2357 /// \headerfile <x86intrin.h>
2358 ///
2359 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2360 ///
2361 /// \param __a
2362 ///    A 128-bit unsigned [16 x i8] vector.
2363 /// \param __b
2364 ///    A 128-bit unsigned [16 x i8] vector.
2365 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2366 ///    each comparison.
_mm_min_epu8(__m128i __a,__m128i __b)2367 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2368                                                           __m128i __b) {
2369   return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2370 }
2371 
2372 /// Multiplies the corresponding elements of two signed [8 x i16]
2373 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2374 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2375 ///
2376 /// \headerfile <x86intrin.h>
2377 ///
2378 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2379 ///
2380 /// \param __a
2381 ///    A 128-bit signed [8 x i16] vector.
2382 /// \param __b
2383 ///    A 128-bit signed [8 x i16] vector.
2384 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2385 ///    each of the eight 32-bit products.
_mm_mulhi_epi16(__m128i __a,__m128i __b)2386 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2387                                                              __m128i __b) {
2388   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2389 }
2390 
2391 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2392 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2393 ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2394 ///
2395 /// \headerfile <x86intrin.h>
2396 ///
2397 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2398 ///
2399 /// \param __a
2400 ///    A 128-bit unsigned [8 x i16] vector.
2401 /// \param __b
2402 ///    A 128-bit unsigned [8 x i16] vector.
2403 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2404 ///    of each of the eight 32-bit products.
_mm_mulhi_epu16(__m128i __a,__m128i __b)2405 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2406                                                              __m128i __b) {
2407   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2408 }
2409 
2410 /// Multiplies the corresponding elements of two signed [8 x i16]
2411 ///    vectors, saving the lower 16 bits of each 32-bit product in the
2412 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2413 ///
2414 /// \headerfile <x86intrin.h>
2415 ///
2416 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2417 ///
2418 /// \param __a
2419 ///    A 128-bit signed [8 x i16] vector.
2420 /// \param __b
2421 ///    A 128-bit signed [8 x i16] vector.
2422 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2423 ///    each of the eight 32-bit products.
_mm_mullo_epi16(__m128i __a,__m128i __b)2424 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2425                                                              __m128i __b) {
2426   return (__m128i)((__v8hu)__a * (__v8hu)__b);
2427 }
2428 
2429 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2430 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2431 ///    product.
2432 ///
2433 /// \headerfile <x86intrin.h>
2434 ///
2435 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2436 ///
2437 /// \param __a
2438 ///    A 64-bit integer containing one of the source operands.
2439 /// \param __b
2440 ///    A 64-bit integer containing one of the source operands.
2441 /// \returns A 64-bit integer vector containing the product of both operands.
_mm_mul_su32(__m64 __a,__m64 __b)2442 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
2443   return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
2444                                              (__v4si)__anyext128(__b)));
2445 }
2446 
2447 /// Multiplies 32-bit unsigned integer values contained in the lower
2448 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2449 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2450 ///
2451 /// \headerfile <x86intrin.h>
2452 ///
2453 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2454 ///
2455 /// \param __a
2456 ///    A [2 x i64] vector containing one of the source operands.
2457 /// \param __b
2458 ///    A [2 x i64] vector containing one of the source operands.
2459 /// \returns A [2 x i64] vector containing the product of both operands.
_mm_mul_epu32(__m128i __a,__m128i __b)2460 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2461                                                            __m128i __b) {
2462   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2463 }
2464 
2465 /// Computes the absolute differences of corresponding 8-bit integer
2466 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2467 ///    separately sums the second 8 absolute differences. Packs these two
2468 ///    unsigned 16-bit integer sums into the upper and lower elements of a
2469 ///    [2 x i64] vector.
2470 ///
2471 /// \headerfile <x86intrin.h>
2472 ///
2473 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2474 ///
2475 /// \param __a
2476 ///    A 128-bit integer vector containing one of the source operands.
2477 /// \param __b
2478 ///    A 128-bit integer vector containing one of the source operands.
2479 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2480 ///    differences between both operands.
_mm_sad_epu8(__m128i __a,__m128i __b)2481 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2482                                                           __m128i __b) {
2483   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2484 }
2485 
2486 /// Subtracts the corresponding 8-bit integer values in the operands.
2487 ///
2488 /// \headerfile <x86intrin.h>
2489 ///
2490 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2491 ///
2492 /// \param __a
2493 ///    A 128-bit integer vector containing the minuends.
2494 /// \param __b
2495 ///    A 128-bit integer vector containing the subtrahends.
2496 /// \returns A 128-bit integer vector containing the differences of the values
2497 ///    in the operands.
_mm_sub_epi8(__m128i __a,__m128i __b)2498 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2499                                                           __m128i __b) {
2500   return (__m128i)((__v16qu)__a - (__v16qu)__b);
2501 }
2502 
2503 /// Subtracts the corresponding 16-bit integer values in the operands.
2504 ///
2505 /// \headerfile <x86intrin.h>
2506 ///
2507 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2508 ///
2509 /// \param __a
2510 ///    A 128-bit integer vector containing the minuends.
2511 /// \param __b
2512 ///    A 128-bit integer vector containing the subtrahends.
2513 /// \returns A 128-bit integer vector containing the differences of the values
2514 ///    in the operands.
_mm_sub_epi16(__m128i __a,__m128i __b)2515 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2516                                                            __m128i __b) {
2517   return (__m128i)((__v8hu)__a - (__v8hu)__b);
2518 }
2519 
2520 /// Subtracts the corresponding 32-bit integer values in the operands.
2521 ///
2522 /// \headerfile <x86intrin.h>
2523 ///
2524 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2525 ///
2526 /// \param __a
2527 ///    A 128-bit integer vector containing the minuends.
2528 /// \param __b
2529 ///    A 128-bit integer vector containing the subtrahends.
2530 /// \returns A 128-bit integer vector containing the differences of the values
2531 ///    in the operands.
_mm_sub_epi32(__m128i __a,__m128i __b)2532 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2533                                                            __m128i __b) {
2534   return (__m128i)((__v4su)__a - (__v4su)__b);
2535 }
2536 
2537 /// Subtracts signed or unsigned 64-bit integer values and writes the
2538 ///    difference to the corresponding bits in the destination.
2539 ///
2540 /// \headerfile <x86intrin.h>
2541 ///
2542 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2543 ///
2544 /// \param __a
2545 ///    A 64-bit integer vector containing the minuend.
2546 /// \param __b
2547 ///    A 64-bit integer vector containing the subtrahend.
2548 /// \returns A 64-bit integer vector containing the difference of the values in
2549 ///    the operands.
_mm_sub_si64(__m64 __a,__m64 __b)2550 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
2551   return (__m64)((unsigned long long)__a - (unsigned long long)__b);
2552 }
2553 
2554 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2555 ///
2556 /// \headerfile <x86intrin.h>
2557 ///
2558 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2559 ///
2560 /// \param __a
2561 ///    A 128-bit integer vector containing the minuends.
2562 /// \param __b
2563 ///    A 128-bit integer vector containing the subtrahends.
2564 /// \returns A 128-bit integer vector containing the differences of the values
2565 ///    in the operands.
_mm_sub_epi64(__m128i __a,__m128i __b)2566 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2567                                                            __m128i __b) {
2568   return (__m128i)((__v2du)__a - (__v2du)__b);
2569 }
2570 
2571 /// Subtracts, with saturation, corresponding 8-bit signed integer values in
2572 ///    the input and returns the differences in the corresponding bytes in the
2573 ///    destination.
2574 ///
2575 ///    Differences greater than 0x7F are saturated to 0x7F, and differences
2576 ///    less than 0x80 are saturated to 0x80.
2577 ///
2578 /// \headerfile <x86intrin.h>
2579 ///
2580 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2581 ///
2582 /// \param __a
2583 ///    A 128-bit integer vector containing the minuends.
2584 /// \param __b
2585 ///    A 128-bit integer vector containing the subtrahends.
2586 /// \returns A 128-bit integer vector containing the differences of the values
2587 ///    in the operands.
_mm_subs_epi8(__m128i __a,__m128i __b)2588 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2589                                                            __m128i __b) {
2590   return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2591 }
2592 
/// Subtracts, with saturation, corresponding 16-bit signed integer values in
///    the input and returns the differences in the corresponding 16-bit
///    elements in the destination.
2596 ///
2597 ///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2598 ///    than 0x8000 are saturated to 0x8000.
2599 ///
2600 /// \headerfile <x86intrin.h>
2601 ///
2602 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2603 ///
2604 /// \param __a
2605 ///    A 128-bit integer vector containing the minuends.
2606 /// \param __b
2607 ///    A 128-bit integer vector containing the subtrahends.
2608 /// \returns A 128-bit integer vector containing the differences of the values
2609 ///    in the operands.
_mm_subs_epi16(__m128i __a,__m128i __b)2610 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2611                                                             __m128i __b) {
2612   return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2613 }
2614 
2615 /// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2616 ///    the input and returns the differences in the corresponding bytes in the
2617 ///    destination.
2618 ///
2619 ///    Differences less than 0x00 are saturated to 0x00.
2620 ///
2621 /// \headerfile <x86intrin.h>
2622 ///
2623 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2624 ///
2625 /// \param __a
2626 ///    A 128-bit integer vector containing the minuends.
2627 /// \param __b
2628 ///    A 128-bit integer vector containing the subtrahends.
2629 /// \returns A 128-bit integer vector containing the unsigned integer
2630 ///    differences of the values in the operands.
_mm_subs_epu8(__m128i __a,__m128i __b)2631 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2632                                                            __m128i __b) {
2633   return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2634 }
2635 
/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
///    the input and returns the differences in the corresponding 16-bit
///    elements in the destination.
2639 ///
2640 ///    Differences less than 0x0000 are saturated to 0x0000.
2641 ///
2642 /// \headerfile <x86intrin.h>
2643 ///
2644 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2645 ///
2646 /// \param __a
2647 ///    A 128-bit integer vector containing the minuends.
2648 /// \param __b
2649 ///    A 128-bit integer vector containing the subtrahends.
2650 /// \returns A 128-bit integer vector containing the unsigned integer
2651 ///    differences of the values in the operands.
_mm_subs_epu16(__m128i __a,__m128i __b)2652 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2653                                                             __m128i __b) {
2654   return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2655 }
2656 
2657 /// Performs a bitwise AND of two 128-bit integer vectors.
2658 ///
2659 /// \headerfile <x86intrin.h>
2660 ///
2661 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2662 ///
2663 /// \param __a
2664 ///    A 128-bit integer vector containing one of the source operands.
2665 /// \param __b
2666 ///    A 128-bit integer vector containing one of the source operands.
2667 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2668 ///    in both operands.
_mm_and_si128(__m128i __a,__m128i __b)2669 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2670                                                            __m128i __b) {
2671   return (__m128i)((__v2du)__a & (__v2du)__b);
2672 }
2673 
2674 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2675 ///    one's complement of the values contained in the first source operand.
2676 ///
2677 /// \headerfile <x86intrin.h>
2678 ///
2679 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2680 ///
2681 /// \param __a
2682 ///    A 128-bit vector containing the left source operand. The one's complement
2683 ///    of this value is used in the bitwise AND.
2684 /// \param __b
2685 ///    A 128-bit vector containing the right source operand.
2686 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2687 ///    complement of the first operand and the values in the second operand.
_mm_andnot_si128(__m128i __a,__m128i __b)2688 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2689                                                               __m128i __b) {
2690   return (__m128i)(~(__v2du)__a & (__v2du)__b);
2691 }
2692 /// Performs a bitwise OR of two 128-bit integer vectors.
2693 ///
2694 /// \headerfile <x86intrin.h>
2695 ///
2696 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2697 ///
2698 /// \param __a
2699 ///    A 128-bit integer vector containing one of the source operands.
2700 /// \param __b
2701 ///    A 128-bit integer vector containing one of the source operands.
2702 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2703 ///    in both operands.
_mm_or_si128(__m128i __a,__m128i __b)2704 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2705                                                           __m128i __b) {
2706   return (__m128i)((__v2du)__a | (__v2du)__b);
2707 }
2708 
2709 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2710 ///
2711 /// \headerfile <x86intrin.h>
2712 ///
2713 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2714 ///
2715 /// \param __a
2716 ///    A 128-bit integer vector containing one of the source operands.
2717 /// \param __b
2718 ///    A 128-bit integer vector containing one of the source operands.
2719 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2720 ///    values in both operands.
_mm_xor_si128(__m128i __a,__m128i __b)2721 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2722                                                            __m128i __b) {
2723   return (__m128i)((__v2du)__a ^ (__v2du)__b);
2724 }
2725 
/// Shifts the whole 128-bit integer vector operand left by the given number
///    of bytes. The vacated low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2746 
/// Alternate spelling of the 128-bit byte-wise left shift: expands to the same
///    byte-shift builtin, with the same operand casts, as \c _mm_slli_si128.
///
/// \param a
///    A 128-bit integer vector holding the source operand.
/// \param imm
///    The number of bytes by which operand \a a is shifted left.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2750 
2751 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2752 ///    by the specified number of bits. Low-order bits are cleared.
2753 ///
2754 /// \headerfile <x86intrin.h>
2755 ///
2756 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2757 ///
2758 /// \param __a
2759 ///    A 128-bit integer vector containing the source operand.
2760 /// \param __count
2761 ///    An integer value specifying the number of bits to left-shift each value
2762 ///    in operand \a __a.
2763 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_slli_epi16(__m128i __a,int __count)2764 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2765                                                             int __count) {
2766   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2767 }
2768 
2769 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2770 ///    by the specified number of bits. Low-order bits are cleared.
2771 ///
2772 /// \headerfile <x86intrin.h>
2773 ///
2774 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2775 ///
2776 /// \param __a
2777 ///    A 128-bit integer vector containing the source operand.
2778 /// \param __count
2779 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2780 ///    to left-shift each value in operand \a __a.
2781 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_sll_epi16(__m128i __a,__m128i __count)2782 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2783                                                            __m128i __count) {
2784   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2785 }
2786 
2787 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2788 ///    by the specified number of bits. Low-order bits are cleared.
2789 ///
2790 /// \headerfile <x86intrin.h>
2791 ///
2792 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2793 ///
2794 /// \param __a
2795 ///    A 128-bit integer vector containing the source operand.
2796 /// \param __count
2797 ///    An integer value specifying the number of bits to left-shift each value
2798 ///    in operand \a __a.
2799 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_slli_epi32(__m128i __a,int __count)2800 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2801                                                             int __count) {
2802   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2803 }
2804 
2805 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2806 ///    by the specified number of bits. Low-order bits are cleared.
2807 ///
2808 /// \headerfile <x86intrin.h>
2809 ///
2810 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2811 ///
2812 /// \param __a
2813 ///    A 128-bit integer vector containing the source operand.
2814 /// \param __count
2815 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2816 ///    to left-shift each value in operand \a __a.
2817 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_sll_epi32(__m128i __a,__m128i __count)2818 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2819                                                            __m128i __count) {
2820   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2821 }
2822 
2823 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2824 ///    by the specified number of bits. Low-order bits are cleared.
2825 ///
2826 /// \headerfile <x86intrin.h>
2827 ///
2828 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2829 ///
2830 /// \param __a
2831 ///    A 128-bit integer vector containing the source operand.
2832 /// \param __count
2833 ///    An integer value specifying the number of bits to left-shift each value
2834 ///    in operand \a __a.
2835 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_slli_epi64(__m128i __a,int __count)2836 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2837                                                             int __count) {
2838   return __builtin_ia32_psllqi128((__v2di)__a, __count);
2839 }
2840 
2841 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2842 ///    by the specified number of bits. Low-order bits are cleared.
2843 ///
2844 /// \headerfile <x86intrin.h>
2845 ///
2846 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2847 ///
2848 /// \param __a
2849 ///    A 128-bit integer vector containing the source operand.
2850 /// \param __count
2851 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2852 ///    to left-shift each value in operand \a __a.
2853 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_sll_epi64(__m128i __a,__m128i __count)2854 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2855                                                            __m128i __count) {
2856   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2857 }
2858 
2859 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2860 ///    by the specified number of bits. High-order bits are filled with the sign
2861 ///    bit of the initial value.
2862 ///
2863 /// \headerfile <x86intrin.h>
2864 ///
2865 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2866 ///
2867 /// \param __a
2868 ///    A 128-bit integer vector containing the source operand.
2869 /// \param __count
2870 ///    An integer value specifying the number of bits to right-shift each value
2871 ///    in operand \a __a.
2872 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srai_epi16(__m128i __a,int __count)2873 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2874                                                             int __count) {
2875   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2876 }
2877 
2878 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2879 ///    by the specified number of bits. High-order bits are filled with the sign
2880 ///    bit of the initial value.
2881 ///
2882 /// \headerfile <x86intrin.h>
2883 ///
2884 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2885 ///
2886 /// \param __a
2887 ///    A 128-bit integer vector containing the source operand.
2888 /// \param __count
2889 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2890 ///    to right-shift each value in operand \a __a.
2891 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_sra_epi16(__m128i __a,__m128i __count)2892 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2893                                                            __m128i __count) {
2894   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2895 }
2896 
2897 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2898 ///    by the specified number of bits. High-order bits are filled with the sign
2899 ///    bit of the initial value.
2900 ///
2901 /// \headerfile <x86intrin.h>
2902 ///
2903 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2904 ///
2905 /// \param __a
2906 ///    A 128-bit integer vector containing the source operand.
2907 /// \param __count
2908 ///    An integer value specifying the number of bits to right-shift each value
2909 ///    in operand \a __a.
2910 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srai_epi32(__m128i __a,int __count)2911 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2912                                                             int __count) {
2913   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2914 }
2915 
2916 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2917 ///    by the specified number of bits. High-order bits are filled with the sign
2918 ///    bit of the initial value.
2919 ///
2920 /// \headerfile <x86intrin.h>
2921 ///
2922 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2923 ///
2924 /// \param __a
2925 ///    A 128-bit integer vector containing the source operand.
2926 /// \param __count
2927 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2928 ///    to right-shift each value in operand \a __a.
2929 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_sra_epi32(__m128i __a,__m128i __count)2930 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2931                                                            __m128i __count) {
2932   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2933 }
2934 
/// Shifts the whole 128-bit integer vector operand right by the given number
///    of bytes. The vacated high-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted right.
/// \returns A 128-bit integer vector holding the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2955 
/// Alternate spelling of the 128-bit byte-wise right shift: expands to the
///    same byte-shift builtin, with the same operand casts, as
///    \c _mm_srli_si128.
///
/// \param a
///    A 128-bit integer vector holding the source operand.
/// \param imm
///    The number of bytes by which operand \a a is shifted right.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2959 
2960 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2961 ///    operand by the specified number of bits. High-order bits are cleared.
2962 ///
2963 /// \headerfile <x86intrin.h>
2964 ///
2965 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2966 ///
2967 /// \param __a
2968 ///    A 128-bit integer vector containing the source operand.
2969 /// \param __count
2970 ///    An integer value specifying the number of bits to right-shift each value
2971 ///    in operand \a __a.
2972 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srli_epi16(__m128i __a,int __count)2973 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2974                                                             int __count) {
2975   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2976 }
2977 
2978 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2979 ///    operand by the specified number of bits. High-order bits are cleared.
2980 ///
2981 /// \headerfile <x86intrin.h>
2982 ///
2983 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2984 ///
2985 /// \param __a
2986 ///    A 128-bit integer vector containing the source operand.
2987 /// \param __count
2988 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2989 ///    to right-shift each value in operand \a __a.
2990 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srl_epi16(__m128i __a,__m128i __count)2991 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2992                                                            __m128i __count) {
2993   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2994 }
2995 
2996 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2997 ///    operand by the specified number of bits. High-order bits are cleared.
2998 ///
2999 /// \headerfile <x86intrin.h>
3000 ///
3001 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3002 ///
3003 /// \param __a
3004 ///    A 128-bit integer vector containing the source operand.
3005 /// \param __count
3006 ///    An integer value specifying the number of bits to right-shift each value
3007 ///    in operand \a __a.
3008 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srli_epi32(__m128i __a,int __count)3009 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
3010                                                             int __count) {
3011   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3012 }
3013 
3014 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3015 ///    operand by the specified number of bits. High-order bits are cleared.
3016 ///
3017 /// \headerfile <x86intrin.h>
3018 ///
3019 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3020 ///
3021 /// \param __a
3022 ///    A 128-bit integer vector containing the source operand.
3023 /// \param __count
3024 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3025 ///    to right-shift each value in operand \a __a.
3026 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srl_epi32(__m128i __a,__m128i __count)3027 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
3028                                                            __m128i __count) {
3029   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3030 }
3031 
3032 /// Right-shifts each of 64-bit values in the 128-bit integer vector
3033 ///    operand by the specified number of bits. High-order bits are cleared.
3034 ///
3035 /// \headerfile <x86intrin.h>
3036 ///
3037 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3038 ///
3039 /// \param __a
3040 ///    A 128-bit integer vector containing the source operand.
3041 /// \param __count
3042 ///    An integer value specifying the number of bits to right-shift each value
3043 ///    in operand \a __a.
3044 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srli_epi64(__m128i __a,int __count)3045 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3046                                                             int __count) {
3047   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3048 }
3049 
3050 /// Right-shifts each of 64-bit values in the 128-bit integer vector
3051 ///    operand by the specified number of bits. High-order bits are cleared.
3052 ///
3053 /// \headerfile <x86intrin.h>
3054 ///
3055 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3056 ///
3057 /// \param __a
3058 ///    A 128-bit integer vector containing the source operand.
3059 /// \param __count
3060 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3061 ///    to right-shift each value in operand \a __a.
3062 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srl_epi64(__m128i __a,__m128i __count)3063 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3064                                                            __m128i __count) {
3065   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3066 }
3067 
3068 /// Compares each of the corresponding 8-bit values of the 128-bit
3069 ///    integer vectors for equality.
3070 ///
3071 ///    Each comparison returns 0x0 for false, 0xFF for true.
3072 ///
3073 /// \headerfile <x86intrin.h>
3074 ///
3075 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3076 ///
3077 /// \param __a
3078 ///    A 128-bit integer vector.
3079 /// \param __b
3080 ///    A 128-bit integer vector.
3081 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpeq_epi8(__m128i __a,__m128i __b)3082 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3083                                                             __m128i __b) {
3084   return (__m128i)((__v16qi)__a == (__v16qi)__b);
3085 }
3086 
3087 /// Compares each of the corresponding 16-bit values of the 128-bit
3088 ///    integer vectors for equality.
3089 ///
3090 ///    Each comparison returns 0x0 for false, 0xFFFF for true.
3091 ///
3092 /// \headerfile <x86intrin.h>
3093 ///
3094 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3095 ///
3096 /// \param __a
3097 ///    A 128-bit integer vector.
3098 /// \param __b
3099 ///    A 128-bit integer vector.
3100 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpeq_epi16(__m128i __a,__m128i __b)3101 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3102                                                              __m128i __b) {
3103   return (__m128i)((__v8hi)__a == (__v8hi)__b);
3104 }
3105 
3106 /// Compares each of the corresponding 32-bit values of the 128-bit
3107 ///    integer vectors for equality.
3108 ///
3109 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3110 ///
3111 /// \headerfile <x86intrin.h>
3112 ///
3113 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3114 ///
3115 /// \param __a
3116 ///    A 128-bit integer vector.
3117 /// \param __b
3118 ///    A 128-bit integer vector.
3119 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpeq_epi32(__m128i __a,__m128i __b)3120 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3121                                                              __m128i __b) {
3122   return (__m128i)((__v4si)__a == (__v4si)__b);
3123 }
3124 
3125 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3126 ///    integer vectors to determine if the values in the first operand are
3127 ///    greater than those in the second operand.
3128 ///
3129 ///    Each comparison returns 0x0 for false, 0xFF for true.
3130 ///
3131 /// \headerfile <x86intrin.h>
3132 ///
3133 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3134 ///
3135 /// \param __a
3136 ///    A 128-bit integer vector.
3137 /// \param __b
3138 ///    A 128-bit integer vector.
3139 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpgt_epi8(__m128i __a,__m128i __b)3140 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3141                                                             __m128i __b) {
3142   /* This function always performs a signed comparison, but __v16qi is a char
3143      which may be signed or unsigned, so use __v16qs. */
3144   return (__m128i)((__v16qs)__a > (__v16qs)__b);
3145 }
3146 
3147 /// Compares each of the corresponding signed 16-bit values of the
3148 ///    128-bit integer vectors to determine if the values in the first operand
3149 ///    are greater than those in the second operand.
3150 ///
3151 ///    Each comparison returns 0x0 for false, 0xFFFF for true.
3152 ///
3153 /// \headerfile <x86intrin.h>
3154 ///
3155 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3156 ///
3157 /// \param __a
3158 ///    A 128-bit integer vector.
3159 /// \param __b
3160 ///    A 128-bit integer vector.
3161 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpgt_epi16(__m128i __a,__m128i __b)3162 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3163                                                              __m128i __b) {
3164   return (__m128i)((__v8hi)__a > (__v8hi)__b);
3165 }
3166 
3167 /// Compares each of the corresponding signed 32-bit values of the
3168 ///    128-bit integer vectors to determine if the values in the first operand
3169 ///    are greater than those in the second operand.
3170 ///
3171 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3172 ///
3173 /// \headerfile <x86intrin.h>
3174 ///
3175 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3176 ///
3177 /// \param __a
3178 ///    A 128-bit integer vector.
3179 /// \param __b
3180 ///    A 128-bit integer vector.
3181 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpgt_epi32(__m128i __a,__m128i __b)3182 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3183                                                              __m128i __b) {
3184   return (__m128i)((__v4si)__a > (__v4si)__b);
3185 }
3186 
3187 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3188 ///    integer vectors to determine if the values in the first operand are less
3189 ///    than those in the second operand.
3190 ///
3191 ///    Each comparison returns 0x0 for false, 0xFF for true.
3192 ///
3193 /// \headerfile <x86intrin.h>
3194 ///
3195 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3196 ///
3197 /// \param __a
3198 ///    A 128-bit integer vector.
3199 /// \param __b
3200 ///    A 128-bit integer vector.
3201 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmplt_epi8(__m128i __a,__m128i __b)3202 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3203                                                             __m128i __b) {
3204   return _mm_cmpgt_epi8(__b, __a);
3205 }
3206 
3207 /// Compares each of the corresponding signed 16-bit values of the
3208 ///    128-bit integer vectors to determine if the values in the first operand
3209 ///    are less than those in the second operand.
3210 ///
3211 ///    Each comparison returns 0x0 for false, 0xFFFF for true.
3212 ///
3213 /// \headerfile <x86intrin.h>
3214 ///
3215 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3216 ///
3217 /// \param __a
3218 ///    A 128-bit integer vector.
3219 /// \param __b
3220 ///    A 128-bit integer vector.
3221 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmplt_epi16(__m128i __a,__m128i __b)3222 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3223                                                              __m128i __b) {
3224   return _mm_cmpgt_epi16(__b, __a);
3225 }
3226 
3227 /// Compares each of the corresponding signed 32-bit values of the
3228 ///    128-bit integer vectors to determine if the values in the first operand
3229 ///    are less than those in the second operand.
3230 ///
3231 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3232 ///
3233 /// \headerfile <x86intrin.h>
3234 ///
3235 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3236 ///
3237 /// \param __a
3238 ///    A 128-bit integer vector.
3239 /// \param __b
3240 ///    A 128-bit integer vector.
3241 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmplt_epi32(__m128i __a,__m128i __b)3242 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3243                                                              __m128i __b) {
3244   return _mm_cmpgt_epi32(__b, __a);
3245 }
3246 
3247 #ifdef __x86_64__
3248 /// Converts a 64-bit signed integer value from the second operand into a
3249 ///    double-precision value and returns it in the lower element of a [2 x
3250 ///    double] vector; the upper element of the returned vector is copied from
3251 ///    the upper element of the first operand.
3252 ///
3253 /// \headerfile <x86intrin.h>
3254 ///
3255 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3256 ///
3257 /// \param __a
3258 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3259 ///    copied to the upper 64 bits of the destination.
3260 /// \param __b
3261 ///    A 64-bit signed integer operand containing the value to be converted.
3262 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3263 ///    converted value of the second operand. The upper 64 bits are copied from
3264 ///    the upper 64 bits of the first operand.
_mm_cvtsi64_sd(__m128d __a,long long __b)3265 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3266                                                             long long __b) {
3267   __a[0] = __b;
3268   return __a;
3269 }
3270 
3271 /// Converts the first (lower) element of a vector of [2 x double] into a
3272 ///    64-bit signed integer value.
3273 ///
3274 ///    If the converted value does not fit in a 64-bit integer, raises a
3275 ///    floating-point invalid exception. If the exception is masked, returns
3276 ///    the most negative integer.
3277 ///
3278 /// \headerfile <x86intrin.h>
3279 ///
3280 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3281 ///
3282 /// \param __a
3283 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3284 ///    conversion.
3285 /// \returns A 64-bit signed integer containing the converted value.
_mm_cvtsd_si64(__m128d __a)3286 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3287   return __builtin_ia32_cvtsd2si64((__v2df)__a);
3288 }
3289 
3290 /// Converts the first (lower) element of a vector of [2 x double] into a
3291 ///    64-bit signed truncated (rounded toward zero) integer value.
3292 ///
3293 ///    If a converted value does not fit in a 64-bit integer, raises a
3294 ///    floating-point invalid exception. If the exception is masked, returns
3295 ///    the most negative integer.
3296 ///
3297 /// \headerfile <x86intrin.h>
3298 ///
3299 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3300 ///   instruction.
3301 ///
3302 /// \param __a
3303 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3304 ///    conversion.
3305 /// \returns A 64-bit signed integer containing the converted value.
_mm_cvttsd_si64(__m128d __a)3306 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3307   return __builtin_ia32_cvttsd2si64((__v2df)__a);
3308 }
3309 #endif
3310 
3311 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3312 ///
3313 /// \headerfile <x86intrin.h>
3314 ///
3315 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3316 ///
3317 /// \param __a
3318 ///    A 128-bit integer vector.
3319 /// \returns A 128-bit vector of [4 x float] containing the converted values.
_mm_cvtepi32_ps(__m128i __a)3320 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3321   return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3322 }
3323 
3324 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3325 ///
3326 ///    If a converted value does not fit in a 32-bit integer, raises a
3327 ///    floating-point invalid exception. If the exception is masked, returns
3328 ///    the most negative integer.
3329 ///
3330 /// \headerfile <x86intrin.h>
3331 ///
3332 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3333 ///
3334 /// \param __a
3335 ///    A 128-bit vector of [4 x float].
3336 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3337 ///    values.
_mm_cvtps_epi32(__m128 __a)3338 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3339   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3340 }
3341 
3342 /// Converts a vector of [4 x float] into four signed truncated (rounded toward
3343 ///    zero) 32-bit integers, returned in a vector of [4 x i32].
3344 ///
3345 ///    If a converted value does not fit in a 32-bit integer, raises a
3346 ///    floating-point invalid exception. If the exception is masked, returns
3347 ///    the most negative integer.
3348 ///
3349 /// \headerfile <x86intrin.h>
3350 ///
3351 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3352 ///   instruction.
3353 ///
3354 /// \param __a
3355 ///    A 128-bit vector of [4 x float].
3356 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
_mm_cvttps_epi32(__m128 __a)3357 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3358   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3359 }
3360 
3361 /// Returns a vector of [4 x i32] where the lowest element is the input
3362 ///    operand and the remaining elements are zero.
3363 ///
3364 /// \headerfile <x86intrin.h>
3365 ///
3366 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3367 ///
3368 /// \param __a
3369 ///    A 32-bit signed integer operand.
3370 /// \returns A 128-bit vector of [4 x i32].
_mm_cvtsi32_si128(int __a)3371 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3372   return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3373 }
3374 
3375 /// Returns a vector of [2 x i64] where the lower element is the input
3376 ///    operand and the upper element is zero.
3377 ///
3378 /// \headerfile <x86intrin.h>
3379 ///
3380 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3381 /// in 64-bit mode.
3382 ///
3383 /// \param __a
3384 ///    A 64-bit signed integer operand containing the value to be converted.
3385 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
_mm_cvtsi64_si128(long long __a)3386 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3387   return __extension__(__m128i)(__v2di){__a, 0};
3388 }
3389 
3390 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3391 ///    32-bit signed integer value.
3392 ///
3393 /// \headerfile <x86intrin.h>
3394 ///
3395 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3396 ///
3397 /// \param __a
3398 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3399 ///    destination.
3400 /// \returns A 32-bit signed integer containing the moved value.
_mm_cvtsi128_si32(__m128i __a)3401 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3402   __v4si __b = (__v4si)__a;
3403   return __b[0];
3404 }
3405 
3406 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3407 ///    64-bit signed integer value.
3408 ///
3409 /// \headerfile <x86intrin.h>
3410 ///
3411 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3412 ///
3413 /// \param __a
3414 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3415 ///    destination.
3416 /// \returns A 64-bit signed integer containing the moved value.
_mm_cvtsi128_si64(__m128i __a)3417 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3418   return __a[0];
3419 }
3420 
3421 /// Moves packed integer values from an aligned 128-bit memory location
3422 ///    to elements in a 128-bit integer vector.
3423 ///
3424 /// \headerfile <x86intrin.h>
3425 ///
3426 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3427 ///
3428 /// \param __p
3429 ///    An aligned pointer to a memory location containing integer values.
3430 /// \returns A 128-bit integer vector containing the moved values.
3431 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_load_si128(__m128i const * __p)3432 _mm_load_si128(__m128i const *__p) {
3433   return *__p;
3434 }
3435 
3436 /// Moves packed integer values from an unaligned 128-bit memory location
3437 ///    to elements in a 128-bit integer vector.
3438 ///
3439 /// \headerfile <x86intrin.h>
3440 ///
3441 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3442 ///
3443 /// \param __p
3444 ///    A pointer to a memory location containing integer values.
3445 /// \returns A 128-bit integer vector containing the moved values.
3446 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i_u const * __p)3447 _mm_loadu_si128(__m128i_u const *__p) {
3448   struct __loadu_si128 {
3449     __m128i_u __v;
3450   } __attribute__((__packed__, __may_alias__));
3451   return ((const struct __loadu_si128 *)__p)->__v;
3452 }
3453 
3454 /// Returns a vector of [2 x i64] where the lower element is taken from
3455 ///    the lower element of the operand, and the upper element is zero.
3456 ///
3457 /// \headerfile <x86intrin.h>
3458 ///
3459 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3460 ///
3461 /// \param __p
3462 ///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3463 ///    the destination.
3464 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3465 ///    moved value. The higher order bits are cleared.
3466 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i_u const * __p)3467 _mm_loadl_epi64(__m128i_u const *__p) {
3468   struct __mm_loadl_epi64_struct {
3469     long long __u;
3470   } __attribute__((__packed__, __may_alias__));
3471   return __extension__(__m128i){
3472       ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3473 }
3474 
3475 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3476 ///    This could be used as an argument to another intrinsic function where the
3477 ///    argument is required but the value is not actually used.
3478 ///
3479 /// \headerfile <x86intrin.h>
3480 ///
3481 /// This intrinsic has no corresponding instruction.
3482 ///
3483 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
_mm_undefined_si128(void)3484 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3485   return (__m128i)__builtin_ia32_undef128();
3486 }
3487 
3488 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3489 ///    the specified 64-bit integer values.
3490 ///
3491 /// \headerfile <x86intrin.h>
3492 ///
3493 /// This intrinsic is a utility function and does not correspond to a specific
3494 ///    instruction.
3495 ///
3496 /// \param __q1
3497 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3498 ///    destination vector of [2 x i64].
3499 /// \param __q0
3500 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3501 ///    destination vector of [2 x i64].
3502 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3503 ///    provided in the operands.
_mm_set_epi64x(long long __q1,long long __q0)3504 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3505                                                             long long __q0) {
3506   return __extension__(__m128i)(__v2di){__q0, __q1};
3507 }
3508 
3509 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3510 ///    the specified 64-bit integer values.
3511 ///
3512 /// \headerfile <x86intrin.h>
3513 ///
3514 /// This intrinsic is a utility function and does not correspond to a specific
3515 ///    instruction.
3516 ///
3517 /// \param __q1
3518 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3519 ///    destination vector of [2 x i64].
3520 /// \param __q0
3521 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3522 ///    destination vector of [2 x i64].
3523 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3524 ///    provided in the operands.
_mm_set_epi64(__m64 __q1,__m64 __q0)3525 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3526                                                            __m64 __q0) {
3527   return _mm_set_epi64x((long long)__q1, (long long)__q0);
3528 }
3529 
3530 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3531 ///    the specified 32-bit integer values.
3532 ///
3533 /// \headerfile <x86intrin.h>
3534 ///
3535 /// This intrinsic is a utility function and does not correspond to a specific
3536 ///    instruction.
3537 ///
3538 /// \param __i3
3539 ///    A 32-bit integer value used to initialize bits [127:96] of the
3540 ///    destination vector.
3541 /// \param __i2
3542 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
3543 ///    vector.
3544 /// \param __i1
3545 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
3546 ///    vector.
3547 /// \param __i0
3548 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
3549 ///    vector.
3550 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3551 ///    provided in the operands.
_mm_set_epi32(int __i3,int __i2,int __i1,int __i0)3552 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3553                                                            int __i1, int __i0) {
3554   return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3555 }
3556 
3557 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3558 ///    the specified 16-bit integer values.
3559 ///
3560 /// \headerfile <x86intrin.h>
3561 ///
3562 /// This intrinsic is a utility function and does not correspond to a specific
3563 ///    instruction.
3564 ///
3565 /// \param __w7
3566 ///    A 16-bit integer value used to initialize bits [127:112] of the
3567 ///    destination vector.
3568 /// \param __w6
3569 ///    A 16-bit integer value used to initialize bits [111:96] of the
3570 ///    destination vector.
3571 /// \param __w5
3572 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
3573 ///    vector.
3574 /// \param __w4
3575 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
3576 ///    vector.
3577 /// \param __w3
3578 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
3579 ///    vector.
3580 /// \param __w2
3581 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
3582 ///    vector.
3583 /// \param __w1
3584 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
3585 ///    vector.
3586 /// \param __w0
3587 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
3588 ///    vector.
3589 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3590 ///    provided in the operands.
3591 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi16(short __w7,short __w6,short __w5,short __w4,short __w3,short __w2,short __w1,short __w0)3592 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3593               short __w2, short __w1, short __w0) {
3594   return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3595                                         __w4, __w5, __w6, __w7};
3596 }
3597 
3598 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3599 ///    the specified 8-bit integer values.
3600 ///
3601 /// \headerfile <x86intrin.h>
3602 ///
3603 /// This intrinsic is a utility function and does not correspond to a specific
3604 ///    instruction.
3605 ///
3606 /// \param __b15
3607 ///    Initializes bits [127:120] of the destination vector.
3608 /// \param __b14
3609 ///    Initializes bits [119:112] of the destination vector.
3610 /// \param __b13
3611 ///    Initializes bits [111:104] of the destination vector.
3612 /// \param __b12
3613 ///    Initializes bits [103:96] of the destination vector.
3614 /// \param __b11
3615 ///    Initializes bits [95:88] of the destination vector.
3616 /// \param __b10
3617 ///    Initializes bits [87:80] of the destination vector.
3618 /// \param __b9
3619 ///    Initializes bits [79:72] of the destination vector.
3620 /// \param __b8
3621 ///    Initializes bits [71:64] of the destination vector.
3622 /// \param __b7
3623 ///    Initializes bits [63:56] of the destination vector.
3624 /// \param __b6
3625 ///    Initializes bits [55:48] of the destination vector.
3626 /// \param __b5
3627 ///    Initializes bits [47:40] of the destination vector.
3628 /// \param __b4
3629 ///    Initializes bits [39:32] of the destination vector.
3630 /// \param __b3
3631 ///    Initializes bits [31:24] of the destination vector.
3632 /// \param __b2
3633 ///    Initializes bits [23:16] of the destination vector.
3634 /// \param __b1
3635 ///    Initializes bits [15:8] of the destination vector.
3636 /// \param __b0
3637 ///    Initializes bits [7:0] of the destination vector.
3638 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3639 ///    provided in the operands.
3640 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi8(char __b15,char __b14,char __b13,char __b12,char __b11,char __b10,char __b9,char __b8,char __b7,char __b6,char __b5,char __b4,char __b3,char __b2,char __b1,char __b0)3641 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3642              char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3643              char __b4, char __b3, char __b2, char __b1, char __b0) {
3644   return __extension__(__m128i)(__v16qi){
3645       __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
3646       __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3647 }
3648 
3649 /// Initializes both values in a 128-bit integer vector with the
3650 ///    specified 64-bit integer value.
3651 ///
3652 /// \headerfile <x86intrin.h>
3653 ///
3654 /// This intrinsic is a utility function and does not correspond to a specific
3655 ///    instruction.
3656 ///
3657 /// \param __q
3658 ///    Integer value used to initialize the elements of the destination integer
3659 ///    vector.
3660 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3661 ///    elements containing the value provided in the operand.
_mm_set1_epi64x(long long __q)3662 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3663   return _mm_set_epi64x(__q, __q);
3664 }
3665 
3666 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3667 ///    specified 64-bit value.
3668 ///
3669 /// \headerfile <x86intrin.h>
3670 ///
3671 /// This intrinsic is a utility function and does not correspond to a specific
3672 ///    instruction.
3673 ///
3674 /// \param __q
3675 ///    A 64-bit value used to initialize the elements of the destination integer
3676 ///    vector.
3677 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3678 ///    containing the value provided in the operand.
_mm_set1_epi64(__m64 __q)3679 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3680   return _mm_set_epi64(__q, __q);
3681 }
3682 
3683 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3684 ///    specified 32-bit value.
3685 ///
3686 /// \headerfile <x86intrin.h>
3687 ///
3688 /// This intrinsic is a utility function and does not correspond to a specific
3689 ///    instruction.
3690 ///
3691 /// \param __i
3692 ///    A 32-bit value used to initialize the elements of the destination integer
3693 ///    vector.
3694 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3695 ///    containing the value provided in the operand.
_mm_set1_epi32(int __i)3696 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3697   return _mm_set_epi32(__i, __i, __i, __i);
3698 }
3699 
3700 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3701 ///    specified 16-bit value.
3702 ///
3703 /// \headerfile <x86intrin.h>
3704 ///
3705 /// This intrinsic is a utility function and does not correspond to a specific
3706 ///    instruction.
3707 ///
3708 /// \param __w
3709 ///    A 16-bit value used to initialize the elements of the destination integer
3710 ///    vector.
3711 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3712 ///    containing the value provided in the operand.
_mm_set1_epi16(short __w)3713 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3714   return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3715 }
3716 
3717 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3718 ///    specified 8-bit value.
3719 ///
3720 /// \headerfile <x86intrin.h>
3721 ///
3722 /// This intrinsic is a utility function and does not correspond to a specific
3723 ///    instruction.
3724 ///
3725 /// \param __b
3726 ///    An 8-bit value used to initialize the elements of the destination integer
3727 ///    vector.
3728 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3729 ///    containing the value provided in the operand.
_mm_set1_epi8(char __b)3730 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3731   return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3732                       __b, __b, __b, __b, __b);
3733 }
3734 
3735 /// Constructs a 128-bit integer vector, initialized in reverse order
3736 ///     with the specified 64-bit integral values.
3737 ///
3738 /// \headerfile <x86intrin.h>
3739 ///
3740 /// This intrinsic does not correspond to a specific instruction.
3741 ///
3742 /// \param __q0
3743 ///    A 64-bit integral value used to initialize the lower 64 bits of the
3744 ///    result.
3745 /// \param __q1
3746 ///    A 64-bit integral value used to initialize the upper 64 bits of the
3747 ///    result.
3748 /// \returns An initialized 128-bit integer vector.
_mm_setr_epi64(__m64 __q0,__m64 __q1)3749 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3750                                                             __m64 __q1) {
3751   return _mm_set_epi64(__q1, __q0);
3752 }
3753 
3754 /// Constructs a 128-bit integer vector, initialized in reverse order
3755 ///     with the specified 32-bit integral values.
3756 ///
3757 /// \headerfile <x86intrin.h>
3758 ///
3759 /// This intrinsic is a utility function and does not correspond to a specific
3760 ///    instruction.
3761 ///
3762 /// \param __i0
3763 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
3764 /// \param __i1
3765 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
3766 /// \param __i2
3767 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
3768 /// \param __i3
3769 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
3770 /// \returns An initialized 128-bit integer vector.
_mm_setr_epi32(int __i0,int __i1,int __i2,int __i3)3771 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3772                                                             int __i2,
3773                                                             int __i3) {
3774   return _mm_set_epi32(__i3, __i2, __i1, __i0);
3775 }
3776 
3777 /// Constructs a 128-bit integer vector, initialized in reverse order
3778 ///     with the specified 16-bit integral values.
3779 ///
3780 /// \headerfile <x86intrin.h>
3781 ///
3782 /// This intrinsic is a utility function and does not correspond to a specific
3783 ///    instruction.
3784 ///
3785 /// \param __w0
3786 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
3787 /// \param __w1
3788 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
3789 /// \param __w2
3790 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
3791 /// \param __w3
3792 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
3793 /// \param __w4
3794 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
3795 /// \param __w5
3796 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
3797 /// \param __w6
3798 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
3799 /// \param __w7
3800 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
3801 /// \returns An initialized 128-bit integer vector.
3802 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0,short __w1,short __w2,short __w3,short __w4,short __w5,short __w6,short __w7)3803 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3804                short __w5, short __w6, short __w7) {
3805   return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3806 }
3807 
3808 /// Constructs a 128-bit integer vector, initialized in reverse order
3809 ///     with the specified 8-bit integral values.
3810 ///
3811 /// \headerfile <x86intrin.h>
3812 ///
3813 /// This intrinsic is a utility function and does not correspond to a specific
3814 ///    instruction.
3815 ///
3816 /// \param __b0
3817 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
3818 /// \param __b1
3819 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
3820 /// \param __b2
3821 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
3822 /// \param __b3
3823 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
3824 /// \param __b4
3825 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
3826 /// \param __b5
3827 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
3828 /// \param __b6
3829 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
3830 /// \param __b7
3831 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
3832 /// \param __b8
3833 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
3834 /// \param __b9
3835 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
3836 /// \param __b10
3837 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
3838 /// \param __b11
3839 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
3840 /// \param __b12
3841 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
3842 /// \param __b13
3843 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
3844 /// \param __b14
3845 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
3846 /// \param __b15
3847 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
3848 /// \returns An initialized 128-bit integer vector.
3849 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0,char __b1,char __b2,char __b3,char __b4,char __b5,char __b6,char __b7,char __b8,char __b9,char __b10,char __b11,char __b12,char __b13,char __b14,char __b15)3850 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3851               char __b6, char __b7, char __b8, char __b9, char __b10,
3852               char __b11, char __b12, char __b13, char __b14, char __b15) {
3853   return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3854                       __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3855 }
3856 
3857 /// Creates a 128-bit integer vector initialized to zero.
3858 ///
3859 /// \headerfile <x86intrin.h>
3860 ///
3861 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3862 ///
3863 /// \returns An initialized 128-bit integer vector with all elements set to
3864 ///    zero.
_mm_setzero_si128(void)3865 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3866   return __extension__(__m128i)(__v2di){0LL, 0LL};
3867 }
3868 
3869 /// Stores a 128-bit integer vector to a memory location aligned on a
3870 ///    128-bit boundary.
3871 ///
3872 /// \headerfile <x86intrin.h>
3873 ///
3874 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3875 ///
3876 /// \param __p
3877 ///    A pointer to an aligned memory location that will receive the integer
3878 ///    values.
3879 /// \param __b
3880 ///    A 128-bit integer vector containing the values to be moved.
_mm_store_si128(__m128i * __p,__m128i __b)3881 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3882                                                           __m128i __b) {
3883   *__p = __b;
3884 }
3885 
3886 /// Stores a 128-bit integer vector to an unaligned memory location.
3887 ///
3888 /// \headerfile <x86intrin.h>
3889 ///
3890 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3891 ///
3892 /// \param __p
3893 ///    A pointer to a memory location that will receive the integer values.
3894 /// \param __b
3895 ///    A 128-bit integer vector containing the values to be moved.
_mm_storeu_si128(__m128i_u * __p,__m128i __b)3896 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3897                                                            __m128i __b) {
3898   struct __storeu_si128 {
3899     __m128i_u __v;
3900   } __attribute__((__packed__, __may_alias__));
3901   ((struct __storeu_si128 *)__p)->__v = __b;
3902 }
3903 
3904 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3905 ///    vector.
3906 ///
3907 /// \headerfile <x86intrin.h>
3908 ///
3909 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3910 ///
3911 /// \param __p
3912 ///    A pointer to a 64-bit memory location. The address of the memory
3913 ///    location does not have to be aligned.
3914 /// \param __b
3915 ///    A 128-bit integer vector containing the value to be stored.
_mm_storeu_si64(void * __p,__m128i __b)3916 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3917                                                           __m128i __b) {
3918   struct __storeu_si64 {
3919     long long __v;
3920   } __attribute__((__packed__, __may_alias__));
3921   ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3922 }
3923 
3924 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3925 ///    vector.
3926 ///
3927 /// \headerfile <x86intrin.h>
3928 ///
3929 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3930 ///
3931 /// \param __p
3932 ///    A pointer to a 32-bit memory location. The address of the memory
3933 ///    location does not have to be aligned.
3934 /// \param __b
3935 ///    A 128-bit integer vector containing the value to be stored.
_mm_storeu_si32(void * __p,__m128i __b)3936 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3937                                                           __m128i __b) {
3938   struct __storeu_si32 {
3939     int __v;
3940   } __attribute__((__packed__, __may_alias__));
3941   ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3942 }
3943 
3944 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3945 ///    vector.
3946 ///
3947 /// \headerfile <x86intrin.h>
3948 ///
3949 /// This intrinsic does not correspond to a specific instruction.
3950 ///
3951 /// \param __p
3952 ///    A pointer to a 16-bit memory location. The address of the memory
3953 ///    location does not have to be aligned.
3954 /// \param __b
3955 ///    A 128-bit integer vector containing the value to be stored.
_mm_storeu_si16(void * __p,__m128i __b)3956 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3957                                                           __m128i __b) {
3958   struct __storeu_si16 {
3959     short __v;
3960   } __attribute__((__packed__, __may_alias__));
3961   ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3962 }
3963 
3964 /// Moves bytes selected by the mask from the first operand to the
3965 ///    specified unaligned memory location. When a mask bit is 1, the
3966 ///    corresponding byte is written, otherwise it is not written.
3967 ///
3968 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3969 ///    used again soon). Exception and trap behavior for elements not selected
3970 ///    for storage to memory are implementation dependent.
3971 ///
3972 /// \headerfile <x86intrin.h>
3973 ///
3974 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3975 ///   instruction.
3976 ///
3977 /// \param __d
3978 ///    A 128-bit integer vector containing the values to be moved.
3979 /// \param __n
3980 ///    A 128-bit integer vector containing the mask. The most significant bit of
3981 ///    each byte represents the mask bits.
3982 /// \param __p
3983 ///    A pointer to an unaligned 128-bit memory location where the specified
3984 ///    values are moved.
_mm_maskmoveu_si128(__m128i __d,__m128i __n,char * __p)3985 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3986                                                               __m128i __n,
3987                                                               char *__p) {
3988   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3989 }
3990 
3991 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3992 ///    a memory location.
3993 ///
3994 /// \headerfile <x86intrin.h>
3995 ///
3996 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3997 ///
3998 /// \param __p
3999 ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
4000 ///    of the integer vector parameter.
4001 /// \param __a
4002 ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4003 ///    value to be stored.
_mm_storel_epi64(__m128i_u * __p,__m128i __a)4004 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
4005                                                            __m128i __a) {
4006   struct __mm_storel_epi64_struct {
4007     long long __u;
4008   } __attribute__((__packed__, __may_alias__));
4009   ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4010 }
4011 
4012 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4013 ///    aligned memory location.
4014 ///
4015 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4016 ///    used again soon).
4017 ///
4018 /// \headerfile <x86intrin.h>
4019 ///
4020 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4021 ///
4022 /// \param __p
4023 ///    A pointer to the 128-bit aligned memory location used to store the value.
4024 /// \param __a
4025 ///    A vector of [2 x double] containing the 64-bit values to be stored.
_mm_stream_pd(void * __p,__m128d __a)4026 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4027                                                         __m128d __a) {
4028   __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4029 }
4030 
4031 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4032 ///
4033 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4034 ///    used again soon).
4035 ///
4036 /// \headerfile <x86intrin.h>
4037 ///
4038 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4039 ///
4040 /// \param __p
4041 ///    A pointer to the 128-bit aligned memory location used to store the value.
4042 /// \param __a
4043 ///    A 128-bit integer vector containing the values to be stored.
_mm_stream_si128(void * __p,__m128i __a)4044 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4045                                                            __m128i __a) {
4046   __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4047 }
4048 
/// Writes a 32-bit integer value to the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location used to store the value.
/// \param __a
///    A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  /* The builtin takes an int-typed destination pointer. */
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
4067 
#ifdef __x86_64__
/// Stores a 64-bit integer value in the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
///    This intrinsic is only declared when compiling for x86-64.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location used to store the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  __builtin_ia32_movnti64((long long *)__p, __a);
}
#endif
4088 
/* The following three intrinsics are only declared here, with C linkage when
 * the header is included from C++; no definition appears in this section. */
#if defined(__cplusplus)
extern "C" {
#endif

/// The cache line containing \a __p is flushed and invalidated from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// Forces strong memory ordering (serialization) between load
///    instructions preceding this instruction and load instructions following
///    this instruction, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between load and store
///    instructions preceding this instruction and load and store instructions
///    following this instruction, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4130 
4131 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4132 ///    vector operands into 8-bit signed integers, and packs the results into
4133 ///    the destination.
4134 ///
4135 ///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
4136 ///    less than 0x80 are saturated to 0x80.
4137 ///
4138 /// \headerfile <x86intrin.h>
4139 ///
4140 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4141 ///
4142 /// \param __a
4143 ///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4144 ///   written to the lower 64 bits of the result.
4145 /// \param __b
4146 ///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4147 ///   written to the higher 64 bits of the result.
4148 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
_mm_packs_epi16(__m128i __a,__m128i __b)4149 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4150                                                              __m128i __b) {
4151   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4152 }
4153 
4154 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4155 ///    vector operands into 16-bit signed integers, and packs the results into
4156 ///    the destination.
4157 ///
4158 ///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4159 ///    values less than 0x8000 are saturated to 0x8000.
4160 ///
4161 /// \headerfile <x86intrin.h>
4162 ///
4163 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4164 ///
4165 /// \param __a
4166 ///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4167 ///    are written to the lower 64 bits of the result.
4168 /// \param __b
4169 ///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4170 ///    are written to the higher 64 bits of the result.
4171 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
_mm_packs_epi32(__m128i __a,__m128i __b)4172 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4173                                                              __m128i __b) {
4174   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4175 }
4176 
4177 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4178 ///    vector operands into 8-bit unsigned integers, and packs the results into
4179 ///    the destination.
4180 ///
4181 ///    Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4182 ///    are saturated to 0x00.
4183 ///
4184 /// \headerfile <x86intrin.h>
4185 ///
4186 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4187 ///
4188 /// \param __a
4189 ///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4190 ///    written to the lower 64 bits of the result.
4191 /// \param __b
4192 ///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4193 ///    written to the higher 64 bits of the result.
4194 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
_mm_packus_epi16(__m128i __a,__m128i __b)4195 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4196                                                               __m128i __b) {
4197   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4198 }
4199 
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] selects values from \a a to be assigned
///    to bits[15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
// The (unsigned short) cast zero-extends the extracted element before it is
// widened to int, so the upper 16 bits of the result are always zero.
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4229 
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into the element selected by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value specifying the index (0 to 7) of the 16-bit element
///    in the result that is replaced by the lower 16 bits of \a b. Element
///    \c n occupies bits [16*n+15:16*n] of the result.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4257 
4258 /// Copies the values of the most significant bits from each 8-bit
4259 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4260 ///    value, zero-extends the value, and writes it to the destination.
4261 ///
4262 /// \headerfile <x86intrin.h>
4263 ///
4264 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4265 ///
4266 /// \param __a
4267 ///    A 128-bit integer vector containing the values with bits to be extracted.
4268 /// \returns The most significant bits from each 8-bit element in \a __a,
4269 ///    written to bits [15:0]. The other bits are assigned zeros.
_mm_movemask_epi8(__m128i __a)4270 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4271   return __builtin_ia32_pmovmskb128((__v16qi)__a);
4272 }
4273 
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4307 
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4340 
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4373 
4374 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4375 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4376 ///
4377 /// \headerfile <x86intrin.h>
4378 ///
4379 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4380 ///   instruction.
4381 ///
4382 /// \param __a
4383 ///    A 128-bit vector of [16 x i8].
4384 ///    Bits [71:64] are written to bits [7:0] of the result. \n
4385 ///    Bits [79:72] are written to bits [23:16] of the result. \n
4386 ///    Bits [87:80] are written to bits [39:32] of the result. \n
4387 ///    Bits [95:88] are written to bits [55:48] of the result. \n
4388 ///    Bits [103:96] are written to bits [71:64] of the result. \n
4389 ///    Bits [111:104] are written to bits [87:80] of the result. \n
4390 ///    Bits [119:112] are written to bits [103:96] of the result. \n
4391 ///    Bits [127:120] are written to bits [119:112] of the result.
4392 /// \param __b
4393 ///    A 128-bit vector of [16 x i8]. \n
4394 ///    Bits [71:64] are written to bits [15:8] of the result. \n
4395 ///    Bits [79:72] are written to bits [31:24] of the result. \n
4396 ///    Bits [87:80] are written to bits [47:40] of the result. \n
4397 ///    Bits [95:88] are written to bits [63:56] of the result. \n
4398 ///    Bits [103:96] are written to bits [79:72] of the result. \n
4399 ///    Bits [111:104] are written to bits [95:88] of the result. \n
4400 ///    Bits [119:112] are written to bits [111:104] of the result. \n
4401 ///    Bits [127:120] are written to bits [127:120] of the result.
4402 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
_mm_unpackhi_epi8(__m128i __a,__m128i __b)4403 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4404                                                                __m128i __b) {
4405   return (__m128i)__builtin_shufflevector(
4406       (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4407       16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4408 }
4409 
4410 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4411 ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4412 ///
4413 /// \headerfile <x86intrin.h>
4414 ///
4415 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4416 ///   instruction.
4417 ///
4418 /// \param __a
4419 ///    A 128-bit vector of [8 x i16].
4420 ///    Bits [79:64] are written to bits [15:0] of the result. \n
4421 ///    Bits [95:80] are written to bits [47:32] of the result. \n
4422 ///    Bits [111:96] are written to bits [79:64] of the result. \n
4423 ///    Bits [127:112] are written to bits [111:96] of the result.
4424 /// \param __b
4425 ///    A 128-bit vector of [8 x i16].
4426 ///    Bits [79:64] are written to bits [31:16] of the result. \n
4427 ///    Bits [95:80] are written to bits [63:48] of the result. \n
4428 ///    Bits [111:96] are written to bits [95:80] of the result. \n
4429 ///    Bits [127:112] are written to bits [127:112] of the result.
4430 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
_mm_unpackhi_epi16(__m128i __a,__m128i __b)4431 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4432                                                                 __m128i __b) {
4433   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4434                                           8 + 5, 6, 8 + 6, 7, 8 + 7);
4435 }
4436 
4437 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4438 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4439 ///
4440 /// \headerfile <x86intrin.h>
4441 ///
4442 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4443 ///   instruction.
4444 ///
4445 /// \param __a
4446 ///    A 128-bit vector of [4 x i32]. \n
4447 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
4448 ///    Bits [127:96] are written to bits [95:64] of the destination.
4449 /// \param __b
4450 ///    A 128-bit vector of [4 x i32]. \n
4451 ///    Bits [95:64] are written to bits [64:32] of the destination. \n
4452 ///    Bits [127:96] are written to bits [127:96] of the destination.
4453 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
_mm_unpackhi_epi32(__m128i __a,__m128i __b)4454 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4455                                                                 __m128i __b) {
4456   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4457                                           4 + 3);
4458 }
4459 
4460 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4461 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4462 ///
4463 /// \headerfile <x86intrin.h>
4464 ///
4465 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4466 ///   instruction.
4467 ///
4468 /// \param __a
4469 ///    A 128-bit vector of [2 x i64]. \n
4470 ///    Bits [127:64] are written to bits [63:0] of the destination.
4471 /// \param __b
4472 ///    A 128-bit vector of [2 x i64]. \n
4473 ///    Bits [127:64] are written to bits [127:64] of the destination.
4474 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
_mm_unpackhi_epi64(__m128i __a,__m128i __b)4475 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4476                                                                 __m128i __b) {
4477   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4478 }
4479 
4480 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4481 ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4482 ///
4483 /// \headerfile <x86intrin.h>
4484 ///
4485 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4486 ///   instruction.
4487 ///
4488 /// \param __a
4489 ///    A 128-bit vector of [16 x i8]. \n
4490 ///    Bits [7:0] are written to bits [7:0] of the result. \n
4491 ///    Bits [15:8] are written to bits [23:16] of the result. \n
4492 ///    Bits [23:16] are written to bits [39:32] of the result. \n
4493 ///    Bits [31:24] are written to bits [55:48] of the result. \n
4494 ///    Bits [39:32] are written to bits [71:64] of the result. \n
4495 ///    Bits [47:40] are written to bits [87:80] of the result. \n
4496 ///    Bits [55:48] are written to bits [103:96] of the result. \n
4497 ///    Bits [63:56] are written to bits [119:112] of the result.
4498 /// \param __b
4499 ///    A 128-bit vector of [16 x i8].
4500 ///    Bits [7:0] are written to bits [15:8] of the result. \n
4501 ///    Bits [15:8] are written to bits [31:24] of the result. \n
4502 ///    Bits [23:16] are written to bits [47:40] of the result. \n
4503 ///    Bits [31:24] are written to bits [63:56] of the result. \n
4504 ///    Bits [39:32] are written to bits [79:72] of the result. \n
4505 ///    Bits [47:40] are written to bits [95:88] of the result. \n
4506 ///    Bits [55:48] are written to bits [111:104] of the result. \n
4507 ///    Bits [63:56] are written to bits [127:120] of the result.
4508 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
_mm_unpacklo_epi8(__m128i __a,__m128i __b)4509 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4510                                                                __m128i __b) {
4511   return (__m128i)__builtin_shufflevector(
4512       (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4513       16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4514 }
4515 
4516 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4517 ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4518 ///    [8 x i16].
4519 ///
4520 /// \headerfile <x86intrin.h>
4521 ///
4522 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4523 ///   instruction.
4524 ///
4525 /// \param __a
4526 ///    A 128-bit vector of [8 x i16].
4527 ///    Bits [15:0] are written to bits [15:0] of the result. \n
4528 ///    Bits [31:16] are written to bits [47:32] of the result. \n
4529 ///    Bits [47:32] are written to bits [79:64] of the result. \n
4530 ///    Bits [63:48] are written to bits [111:96] of the result.
4531 /// \param __b
4532 ///    A 128-bit vector of [8 x i16].
4533 ///    Bits [15:0] are written to bits [31:16] of the result. \n
4534 ///    Bits [31:16] are written to bits [63:48] of the result. \n
4535 ///    Bits [47:32] are written to bits [95:80] of the result. \n
4536 ///    Bits [63:48] are written to bits [127:112] of the result.
4537 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
_mm_unpacklo_epi16(__m128i __a,__m128i __b)4538 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4539                                                                 __m128i __b) {
4540   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4541                                           8 + 1, 2, 8 + 2, 3, 8 + 3);
4542 }
4543 
4544 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4545 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4546 ///
4547 /// \headerfile <x86intrin.h>
4548 ///
4549 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4550 ///   instruction.
4551 ///
4552 /// \param __a
4553 ///    A 128-bit vector of [4 x i32]. \n
4554 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
4555 ///    Bits [63:32] are written to bits [95:64] of the destination.
4556 /// \param __b
4557 ///    A 128-bit vector of [4 x i32]. \n
4558 ///    Bits [31:0] are written to bits [64:32] of the destination. \n
4559 ///    Bits [63:32] are written to bits [127:96] of the destination.
4560 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
_mm_unpacklo_epi32(__m128i __a,__m128i __b)4561 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4562                                                                 __m128i __b) {
4563   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4564                                           4 + 1);
4565 }
4566 
4567 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4568 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4569 ///
4570 /// \headerfile <x86intrin.h>
4571 ///
4572 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4573 ///   instruction.
4574 ///
4575 /// \param __a
4576 ///    A 128-bit vector of [2 x i64]. \n
4577 ///    Bits [63:0] are written to bits [63:0] of the destination. \n
4578 /// \param __b
4579 ///    A 128-bit vector of [2 x i64]. \n
4580 ///    Bits [63:0] are written to bits [127:64] of the destination. \n
4581 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
_mm_unpacklo_epi64(__m128i __a,__m128i __b)4582 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4583                                                                 __m128i __b) {
4584   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4585 }
4586 
4587 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4588 ///    integer.
4589 ///
4590 /// \headerfile <x86intrin.h>
4591 ///
4592 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4593 ///
4594 /// \param __a
4595 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4596 ///    destination.
4597 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
_mm_movepi64_pi64(__m128i __a)4598 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4599   return (__m64)__a[0];
4600 }
4601 
4602 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4603 ///    upper bits.
4604 ///
4605 /// \headerfile <x86intrin.h>
4606 ///
4607 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4608 ///
4609 /// \param __a
4610 ///    A 64-bit value.
4611 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4612 ///    the operand. The upper 64 bits are assigned zeros.
_mm_movpi64_epi64(__m64 __a)4613 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4614   return __extension__(__m128i)(__v2di){(long long)__a, 0};
4615 }
4616 
4617 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4618 ///    integer vector, zeroing the upper bits.
4619 ///
4620 /// \headerfile <x86intrin.h>
4621 ///
4622 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4623 ///
4624 /// \param __a
4625 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4626 ///    destination.
4627 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4628 ///    the operand. The upper 64 bits are assigned zeros.
_mm_move_epi64(__m128i __a)4629 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4630   return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4631 }
4632 
4633 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4634 ///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4635 ///    double].
4636 ///
4637 /// \headerfile <x86intrin.h>
4638 ///
4639 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4640 ///
4641 /// \param __a
4642 ///    A 128-bit vector of [2 x double]. \n
4643 ///    Bits [127:64] are written to bits [63:0] of the destination.
4644 /// \param __b
4645 ///    A 128-bit vector of [2 x double]. \n
4646 ///    Bits [127:64] are written to bits [127:64] of the destination.
4647 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
_mm_unpackhi_pd(__m128d __a,__m128d __b)4648 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4649                                                              __m128d __b) {
4650   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4651 }
4652 
4653 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4654 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4655 ///    double].
4656 ///
4657 /// \headerfile <x86intrin.h>
4658 ///
4659 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4660 ///
4661 /// \param __a
4662 ///    A 128-bit vector of [2 x double]. \n
4663 ///    Bits [63:0] are written to bits [63:0] of the destination.
4664 /// \param __b
4665 ///    A 128-bit vector of [2 x double]. \n
4666 ///    Bits [63:0] are written to bits [127:64] of the destination.
4667 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
_mm_unpacklo_pd(__m128d __a,__m128d __b)4668 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4669                                                              __m128d __b) {
4670   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4671 }
4672 
4673 /// Extracts the sign bits of the double-precision values in the 128-bit
4674 ///    vector of [2 x double], zero-extends the value, and writes it to the
4675 ///    low-order bits of the destination.
4676 ///
4677 /// \headerfile <x86intrin.h>
4678 ///
4679 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4680 ///
4681 /// \param __a
4682 ///    A 128-bit vector of [2 x double] containing the values with sign bits to
4683 ///    be extracted.
4684 /// \returns The sign bits from each of the double-precision elements in \a __a,
4685 ///    written to bits [1:0]. The remaining bits are assigned values of zero.
_mm_movemask_pd(__m128d __a)4686 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4687   return __builtin_ia32_movmskpd((__v2df)__a);
4688 }
4689 
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4720 
4721 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4722 ///    floating-point vector of [4 x float].
4723 ///
4724 /// \headerfile <x86intrin.h>
4725 ///
4726 /// This intrinsic has no corresponding instruction.
4727 ///
4728 /// \param __a
4729 ///    A 128-bit floating-point vector of [2 x double].
4730 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4731 ///    bitwise pattern as the parameter.
_mm_castpd_ps(__m128d __a)4732 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4733   return (__m128)__a;
4734 }
4735 
4736 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4737 ///    integer vector.
4738 ///
4739 /// \headerfile <x86intrin.h>
4740 ///
4741 /// This intrinsic has no corresponding instruction.
4742 ///
4743 /// \param __a
4744 ///    A 128-bit floating-point vector of [2 x double].
4745 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4746 ///    parameter.
_mm_castpd_si128(__m128d __a)4747 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4748   return (__m128i)__a;
4749 }
4750 
4751 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4752 ///    floating-point vector of [2 x double].
4753 ///
4754 /// \headerfile <x86intrin.h>
4755 ///
4756 /// This intrinsic has no corresponding instruction.
4757 ///
4758 /// \param __a
4759 ///    A 128-bit floating-point vector of [4 x float].
4760 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4761 ///    bitwise pattern as the parameter.
_mm_castps_pd(__m128 __a)4762 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4763   return (__m128d)__a;
4764 }
4765 
4766 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4767 ///    integer vector.
4768 ///
4769 /// \headerfile <x86intrin.h>
4770 ///
4771 /// This intrinsic has no corresponding instruction.
4772 ///
4773 /// \param __a
4774 ///    A 128-bit floating-point vector of [4 x float].
4775 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4776 ///    parameter.
_mm_castps_si128(__m128 __a)4777 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4778   return (__m128i)__a;
4779 }
4780 
4781 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4782 ///    of [4 x float].
4783 ///
4784 /// \headerfile <x86intrin.h>
4785 ///
4786 /// This intrinsic has no corresponding instruction.
4787 ///
4788 /// \param __a
4789 ///    A 128-bit integer vector.
4790 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4791 ///    bitwise pattern as the parameter.
_mm_castsi128_ps(__m128i __a)4792 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4793   return (__m128)__a;
4794 }
4795 
4796 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4797 ///    of [2 x double].
4798 ///
4799 /// \headerfile <x86intrin.h>
4800 ///
4801 /// This intrinsic has no corresponding instruction.
4802 ///
4803 /// \param __a
4804 ///    A 128-bit integer vector.
4805 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4806 ///    bitwise pattern as the parameter.
_mm_castsi128_pd(__m128i __a)4807 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4808   return (__m128d)__a;
4809 }
4810 
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
                                 (c)))
4846 
/// Compares the corresponding scalar double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
                                 (c)))
4882 
/* Give _mm_pause C linkage when this header is compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4899 
/* Retire the helper macros so they are not visible past this point to code
 * that includes this header. */
#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS
4903 
/// Builds the 2-bit selector used by \c _mm_shuffle_pd: \a x is placed in
///    bit 1 and \a y in bit 0, giving a mask of the form <c>[x, y]</c>.
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4905 
/* Denormals-zero mode values, as accepted by _MM_SET_DENORMALS_ZERO_MODE()
 * and produced by _MM_GET_DENORMALS_ZERO_MODE(). */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Mask isolating the denormals-zero mode bit in the value read by
 * _mm_getcsr(). */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Reads the current mode: the _mm_getcsr() value with every bit outside
 * _MM_DENORMALS_ZERO_MASK cleared. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Writes a new mode: clears the masked bit in the _mm_getcsr() value, ORs in
 * x, and stores the result with _mm_setcsr(). x is not masked, so pass one of
 * the _MM_DENORMALS_ZERO_* values. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4914 
4915 #endif /* __EMMINTRIN_H */
4916