/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
                 __min_vector_width__(128)))
#endif

#define __trunc64(x) \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __anyext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, -1, -1)

/// Adds lower double-precision values in both operands and returns the
/// sum in the lower 64 bits of the result. The upper 64 bits of the result
/// are copied from the upper double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
/// from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a, __m128d __b) {
  __a[0] += __b[0];
  return __a;
}

/// Adds two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the sums of both
/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a + (__v2df)__b);
}

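/* Illustrative sketch (not part of the original header): given __a = {a0, a1}
 * and __b = {b0, b1}, _mm_add_sd yields {a0+b0, a1} while _mm_add_pd yields
 * {a0+b0, a1+b1}. The helper name below is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
__example_add_forms(__m128d __a, __m128d __b) {
  __m128d __scalar = _mm_add_sd(__a, __b); /* {a0 + b0, a1} */
  __m128d __packed = _mm_add_pd(__a, __b); /* {a0 + b0, a1 + b1} */
  /* Recombine: low lane from the scalar sum, high lane from the packed sum. */
  return __extension__(__m128d){__scalar[0], __packed[1]};
}
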
/// Subtracts the lower double-precision value of the second operand
/// from the lower double-precision value of the first operand and returns
/// the difference in the lower 64 bits of the result. The upper 64 bits of
/// the result are copied from the upper double-precision value of the first
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the minuend.
/// \param __b
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// difference of the lower 64 bits of both operands. The upper 64 bits are
/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a, __m128d __b) {
  __a[0] -= __b[0];
  return __a;
}

/// Subtracts two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the minuend.
/// \param __b
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the differences between
/// both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a - (__v2df)__b);
}

/// Multiplies lower double-precision values in both operands and returns
/// the product in the lower 64 bits of the result. The upper 64 bits of the
/// result are copied from the upper double-precision value of the first
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// product of the lower 64 bits of both operands. The upper 64 bits are
/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a, __m128d __b) {
  __a[0] *= __b[0];
  return __a;
}

/// Multiplies two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \returns A 128-bit vector of [2 x double] containing the products of both
/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a * (__v2df)__b);
}

/// Divides the lower double-precision value of the first operand by the
/// lower double-precision value of the second operand and returns the
/// quotient in the lower 64 bits of the result. The upper 64 bits of the
/// result are copied from the upper double-precision value of the first
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the dividend.
/// \param __b
/// A 128-bit vector of [2 x double] containing the divisor.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// quotient of the lower 64 bits of both operands. The upper 64 bits are
/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a, __m128d __b) {
  __a[0] /= __b[0];
  return __a;
}

/// Performs an element-by-element division of two 128-bit vectors of
/// [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the dividend.
/// \param __b
/// A 128-bit vector of [2 x double] containing the divisor.
/// \returns A 128-bit vector of [2 x double] containing the quotients of both
/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a / (__v2df)__b);
}

/// Calculates the square root of the lower double-precision value of
/// the second operand and returns it in the lower 64 bits of the result.
/// The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// upper 64 bits of this operand are copied to the upper 64 bits of the
/// result.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// square root is calculated using the lower 64 bits of this operand.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// square root of the lower 64 bits of operand \a __b, and whose upper 64
/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
  return __extension__(__m128d){__c[0], __a[1]};
}

/// Calculates the square root of each of the two values stored in a
/// 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
  return __builtin_ia32_sqrtpd((__v2df)__a);
}

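/* Illustrative sketch (not part of the original header): the length of a 2-D
 * vector stored as {x, y}, using the packed multiply, scalar add, and scalar
 * square root defined above. The helper name is hypothetical. */
static __inline__ double __DEFAULT_FN_ATTRS
__example_vector_length(__m128d __v) {
  __m128d __sq = _mm_mul_pd(__v, __v); /* {x*x, y*y} */
  /* Fold the high product into the low lane: low = x*x + y*y. */
  __m128d __sum = _mm_add_sd(__sq, __extension__(__m128d){__sq[1], 0.0});
  return _mm_sqrt_sd(__sum, __sum)[0]; /* sqrt(x*x + y*y) */
}
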
/// Compares lower 64-bit double-precision values of both operands, and
/// returns the lesser of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// lower 64 bits of this operand are used in the comparison.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// lower 64 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// minimum value between both operands. The upper 64 bits are copied from
/// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
}

/// Performs element-by-element comparison of the two 128-bit vectors of
/// [2 x double] and returns a vector containing the lesser of each pair of
/// values.
///
/// If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \returns A 128-bit vector of [2 x double] containing the minimum values
/// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a, __m128d __b) {
  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
}

/// Compares lower 64-bit double-precision values of both operands, and
/// returns the greater of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// lower 64 bits of this operand are used in the comparison.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// lower 64 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// maximum value between both operands. The upper 64 bits are copied from
/// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
}

/// Performs element-by-element comparison of the two 128-bit vectors of
/// [2 x double] and returns a vector containing the greater of each pair
/// of values.
///
/// If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \returns A 128-bit vector of [2 x double] containing the maximum values
/// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a, __m128d __b) {
  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
}

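/* Illustrative sketch (not part of the original header): clamp both lanes of
 * __x into [__lo, __hi] with the packed min/max above. Because MINPD/MAXPD
 * return the second operand when either input is NaN, a NaN lane in __x
 * comes out as the corresponding __lo value with this operand order. The
 * helper name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
__example_clamp_pd(__m128d __x, __m128d __lo, __m128d __hi) {
  return _mm_min_pd(_mm_max_pd(__x, __lo), __hi);
}
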
/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a & (__v2du)__b);
}

/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
/// the one's complement of the values contained in the first source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the left source operand. The
/// one's complement of this value is used in the bitwise AND.
/// \param __b
/// A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
/// values in the second operand and the one's complement of the first
/// operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a, __m128d __b) {
  return (__m128d)(~(__v2du)__a & (__v2du)__b);
}

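/* Illustrative sketch (not part of the original header): a packed absolute
 * value built from _mm_andnot_pd. -0.0 has only the sign bit set, so
 * ~sign & x clears exactly that bit in each lane. The helper name is
 * hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
__example_abs_pd(__m128d __x) {
  const __m128d __sign = __extension__(__m128d){-0.0, -0.0};
  return _mm_andnot_pd(__sign, __x);
}
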
/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a | (__v2du)__b);
}

/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a ^ (__v2du)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] for equality.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are less than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are less than or equal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are greater than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are greater than or equal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are ordered with respect to those in the second operand.
///
/// A pair of double-precision values are ordered with respect to each
/// other if neither value is a NaN. Each comparison returns 0x0 for false,
/// 0xFFFFFFFFFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are unordered with respect to those in the second operand.
///
/// A pair of double-precision values are unordered with respect to each
/// other if one or both values are NaN. Each comparison returns 0x0 for
/// false, 0xFFFFFFFFFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
/// instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are unequal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not less than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not less than or equal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not greater than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not greater than or equal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
}

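/* Illustrative sketch (not part of the original header): the all-ones /
 * all-zeros masks produced by the packed comparisons above are typically
 * combined with the bitwise operations to select lanes without branching.
 * This picks, per lane, __a where __a < __b and __b elsewhere; a NaN lane
 * yields __b because its comparison mask is all zeros. The helper name is
 * hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
__example_select_lt_pd(__m128d __a, __m128d __b) {
  __m128d __mask = _mm_cmplt_pd(__a, __b);
  return _mm_or_pd(_mm_and_pd(__mask, __a), _mm_andnot_pd(__mask, __b));
}
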
/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is ordered with respect to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
/// of double-precision values are ordered with respect to each other if
/// neither value is a NaN.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unordered with respect to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
/// of double-precision values are unordered with respect to each other if
/// one or both values are NaN.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
/// instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not less than the corresponding
/// value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not less than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not greater than the corresponding
/// value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not greater than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}

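/* Illustrative sketch (not part of the original header): the scalar (_sd)
 * comparisons above only produce a mask in the low 64 bits; the high lane is
 * a copy of __a's high lane, not a mask. Reinterpreting the low lane as an
 * integer gives -1 (all ones) for true and 0 for false. The helper name is
 * hypothetical. */
static __inline__ long long __DEFAULT_FN_ATTRS
__example_lt_low_mask(__m128d __a, __m128d __b) {
  return ((__v2di)_mm_cmplt_sd(__a, __b))[0];
}
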
/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
}

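/* Illustrative sketch (not part of the original header): unlike the _mm_cmp*
 * intrinsics, the comi/ucomi intrinsics return a plain int, so they can be
 * used directly as branch conditions; with a NaN operand the "less than"
 * test below returns 0, so __b's low value is chosen. The helper name is
 * hypothetical. */
static __inline__ double __DEFAULT_FN_ATTRS
__example_smaller_low(__m128d __a, __m128d __b) {
  return _mm_comilt_sd(__a, __b) ? __a[0] : __b[0];
}
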
1130 /// Compares the lower double-precision floating-point values in each of
1131 /// the two 128-bit floating-point vectors of [2 x double] for equality.
1132 ///
1133 /// The comparison returns 0 for false, 1 for true. If either value in a
1134 /// comparison is NaN, returns 0.
1135 ///
1136 /// \headerfile <x86intrin.h>
1137 ///
1138 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1139 ///
1140 /// \param __a
1141 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1142 /// compared to the lower double-precision value of \a __b.
1143 /// \param __b
1144 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1145 /// compared to the lower double-precision value of \a __a.
1146 /// \returns An integer containing the comparison results.
_mm_ucomieq_sd(__m128d __a,__m128d __b)1147 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1148 __m128d __b) {
1149 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1150 }
1151
1152 /// Compares the lower double-precision floating-point values in each of
1153 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1154 /// the value in the first parameter is less than the corresponding value in
1155 /// the second parameter.
1156 ///
1157 /// The comparison returns 0 for false, 1 for true. If either value in a
1158 /// comparison is NaN, returns 0.
1159 ///
1160 /// \headerfile <x86intrin.h>
1161 ///
1162 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1163 ///
1164 /// \param __a
1165 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1166 /// compared to the lower double-precision value of \a __b.
1167 /// \param __b
1168 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1169 /// compared to the lower double-precision value of \a __a.
1170 /// \returns An integer containing the comparison results.
_mm_ucomilt_sd(__m128d __a,__m128d __b)1171 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1172 __m128d __b) {
1173 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1174 }
1175
1176 /// Compares the lower double-precision floating-point values in each of
1177 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1178 /// the value in the first parameter is less than or equal to the
1179 /// corresponding value in the second parameter.
1180 ///
1181 /// The comparison returns 0 for false, 1 for true. If either value in a
1182 /// comparison is NaN, returns 0.
1183 ///
1184 /// \headerfile <x86intrin.h>
1185 ///
1186 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1187 ///
1188 /// \param __a
1189 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1190 /// compared to the lower double-precision value of \a __b.
1191 /// \param __b
1192 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1193 /// compared to the lower double-precision value of \a __a.
1194 /// \returns An integer containing the comparison results.
_mm_ucomile_sd(__m128d __a,__m128d __b)1195 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1196 __m128d __b) {
1197 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1198 }
1199
1200 /// Compares the lower double-precision floating-point values in each of
1201 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1202 /// the value in the first parameter is greater than the corresponding value
1203 /// in the second parameter.
1204 ///
1205 /// The comparison returns 0 for false, 1 for true. If either value in a
1206 /// comparison is NaN, returns 0.
1207 ///
1208 /// \headerfile <x86intrin.h>
1209 ///
1210 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1211 ///
1212 /// \param __a
1213 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1214 /// compared to the lower double-precision value of \a __b.
1215 /// \param __b
1216 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1217 /// compared to the lower double-precision value of \a __a.
1218 /// \returns An integer containing the comparison results.
_mm_ucomigt_sd(__m128d __a,__m128d __b)1219 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1220 __m128d __b) {
1221 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1222 }
1223
1224 /// Compares the lower double-precision floating-point values in each of
1225 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1226 /// the value in the first parameter is greater than or equal to the
1227 /// corresponding value in the second parameter.
1228 ///
1229 /// The comparison returns 0 for false, 1 for true. If either value in a
1230 /// comparison is NaN, returns 0.
1231 ///
1232 /// \headerfile <x86intrin.h>
1233 ///
1234 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1235 ///
1236 /// \param __a
1237 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1238 /// compared to the lower double-precision value of \a __b.
1239 /// \param __b
1240 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1241 /// compared to the lower double-precision value of \a __a.
1242 /// \returns An integer containing the comparison results.
1243 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1244 __m128d __b) {
1245 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1246 }
1247
1248 /// Compares the lower double-precision floating-point values in each of
1249 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1250 /// the value in the first parameter is unequal to the corresponding value in
1251 /// the second parameter.
1252 ///
1253 /// The comparison returns 0 for false, 1 for true. If either value in a
1254 /// comparison is NaN, returns 1.
1255 ///
1256 /// \headerfile <x86intrin.h>
1257 ///
1258 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1259 ///
1260 /// \param __a
1261 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1262 /// compared to the lower double-precision value of \a __b.
1263 /// \param __b
1264 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1265 /// compared to the lower double-precision value of \a __a.
1266 /// \returns An integer containing the comparison result.
1267 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1268 __m128d __b) {
1269 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1270 }
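
/* A minimal usage sketch of the unordered scalar comparisons above, assuming
 * SSE2 code generation is enabled; the values are arbitrary:
 *
 *   __m128d __x = _mm_set_sd(1.0);
 *   __m128d __y = _mm_set_sd(2.0);
 *   int __lt = _mm_ucomilt_sd(__x, __y);   // 1: 1.0 < 2.0
 *   int __ge = _mm_ucomige_sd(__x, __y);   // 0
 *   // With a NaN operand every comparison returns 0, except
 *   // _mm_ucomineq_sd, which returns 1.
 *   int __un = _mm_ucomilt_sd(_mm_set_sd(__builtin_nan("")), __y);  // 0
 */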
1271
1272 /// Converts the two double-precision floating-point elements of a
1273 /// 128-bit vector of [2 x double] into two single-precision floating-point
1274 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1275 /// The upper 64 bits of the result vector are set to zero.
1276 ///
1277 /// \headerfile <x86intrin.h>
1278 ///
1279 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1280 ///
1281 /// \param __a
1282 /// A 128-bit vector of [2 x double].
1283 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1284 /// converted values. The upper 64 bits are set to zero.
1285 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1286 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1287 }
1288
1289 /// Converts the lower two single-precision floating-point elements of a
1290 /// 128-bit vector of [4 x float] into two double-precision floating-point
1291 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1292 /// elements of the input vector are unused.
1293 ///
1294 /// \headerfile <x86intrin.h>
1295 ///
1296 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1297 ///
1298 /// \param __a
1299 /// A 128-bit vector of [4 x float]. The lower two single-precision
1300 /// floating-point elements are converted to double-precision values. The
1301 /// upper two elements are unused.
1302 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1303 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1304 return (__m128d) __builtin_convertvector(
1305 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1306 }
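
/* A minimal round-trip sketch for the packed conversions above; the values
 * are arbitrary and assume SSE2 is enabled:
 *
 *   __m128d __d = _mm_cvtps_pd(_mm_cvtpd_ps(_mm_set_pd(2.0, 1.0)));
 *   // _mm_cvtpd_ps yields {1.0f, 2.0f, 0.0f, 0.0f}; converting back
 *   // restores {1.0, 2.0} because both values are exactly representable
 *   // in single precision.
 */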
1307
1308 /// Converts the lower two integer elements of a 128-bit vector of
1309 /// [4 x i32] into two double-precision floating-point values, returned in a
1310 /// 128-bit vector of [2 x double].
1311 ///
1312 /// The upper two elements of the input vector are unused.
1313 ///
1314 /// \headerfile <x86intrin.h>
1315 ///
1316 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1317 ///
1318 /// \param __a
1319 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1320 /// converted to double-precision values.
1321 ///
1322 /// The upper two elements are unused.
1323 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1324 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1325 return (__m128d) __builtin_convertvector(
1326 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1327 }
1328
1329 /// Converts the two double-precision floating-point elements of a
1330 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1331 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1332 /// 64 bits of the result vector are set to zero.
1333 ///
1334 /// If a converted value does not fit in a 32-bit integer, raises a
1335 /// floating-point invalid exception. If the exception is masked, returns
1336 /// the most negative integer.
1337 ///
1338 /// \headerfile <x86intrin.h>
1339 ///
1340 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1341 ///
1342 /// \param __a
1343 /// A 128-bit vector of [2 x double].
1344 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1345 /// converted values. The upper 64 bits are set to zero.
1346 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1347 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1348 }
1349
1350 /// Converts the low-order element of a 128-bit vector of [2 x double]
1351 /// into a 32-bit signed integer value.
1352 ///
1353 /// If the converted value does not fit in a 32-bit integer, raises a
1354 /// floating-point invalid exception. If the exception is masked, returns
1355 /// the most negative integer.
1356 ///
1357 /// \headerfile <x86intrin.h>
1358 ///
1359 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1360 ///
1361 /// \param __a
1362 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1363 /// conversion.
1364 /// \returns A 32-bit signed integer containing the converted value.
1365 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1366 return __builtin_ia32_cvtsd2si((__v2df)__a);
1367 }
1368
1369 /// Converts the lower double-precision floating-point element of a
1370 /// 128-bit vector of [2 x double], in the second parameter, into a
1371 /// single-precision floating-point value, returned in the lower 32 bits of a
1372 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1373 /// copied from the upper 96 bits of the first parameter.
1374 ///
1375 /// \headerfile <x86intrin.h>
1376 ///
1377 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1378 ///
1379 /// \param __a
1380 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1381 /// copied to the upper 96 bits of the result.
1382 /// \param __b
1383 /// A 128-bit vector of [2 x double]. The lower double-precision
1384 /// floating-point element is used in the conversion.
1385 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1386 /// converted value from the second parameter. The upper 96 bits are copied
1387 /// from the upper 96 bits of the first parameter.
1388 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1389 __m128d __b) {
1390 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1391 }
1392
1393 /// Converts a 32-bit signed integer value, in the second parameter, into
1394 /// a double-precision floating-point value, returned in the lower 64 bits of
1395 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1396 /// are copied from the upper 64 bits of the first parameter.
1397 ///
1398 /// \headerfile <x86intrin.h>
1399 ///
1400 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1401 ///
1402 /// \param __a
1403 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1404 /// copied to the upper 64 bits of the result.
1405 /// \param __b
1406 /// A 32-bit signed integer containing the value to be converted.
1407 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1408 /// converted value from the second parameter. The upper 64 bits are copied
1409 /// from the upper 64 bits of the first parameter.
1410 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1411 int __b) {
1412 __a[0] = __b;
1413 return __a;
1414 }
1415
1416 /// Converts the lower single-precision floating-point element of a
1417 /// 128-bit vector of [4 x float], in the second parameter, into a
1418 /// double-precision floating-point value, returned in the lower 64 bits of
1419 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1420 /// are copied from the upper 64 bits of the first parameter.
1421 ///
1422 /// \headerfile <x86intrin.h>
1423 ///
1424 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1425 ///
1426 /// \param __a
1427 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1428 /// copied to the upper 64 bits of the result.
1429 /// \param __b
1430 /// A 128-bit vector of [4 x float]. The lower single-precision
1431 /// floating-point element is used in the conversion.
1432 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1433 /// converted value from the second parameter. The upper 64 bits are copied
1434 /// from the upper 64 bits of the first parameter.
1435 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1436 __m128 __b) {
1437 __a[0] = __b[0];
1438 return __a;
1439 }
1440
1441 /// Converts the two double-precision floating-point elements of a
1442 /// 128-bit vector of [2 x double] into two signed truncated (rounded
1443 /// toward zero) 32-bit integer values, returned in the lower 64 bits
1444 /// of a 128-bit vector of [4 x i32].
1445 ///
1446 /// If a converted value does not fit in a 32-bit integer, raises a
1447 /// floating-point invalid exception. If the exception is masked, returns
1448 /// the most negative integer.
1449 ///
1450 /// \headerfile <x86intrin.h>
1451 ///
1452 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1453 /// instruction.
1454 ///
1455 /// \param __a
1456 /// A 128-bit vector of [2 x double].
1457 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1458 /// converted values. The upper 64 bits are set to zero.
1459 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1460 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1461 }
1462
1463 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1464 /// signed truncated (rounded toward zero) integer value.
1465 ///
1466 /// If the converted value does not fit in a 32-bit integer, raises a
1467 /// floating-point invalid exception. If the exception is masked, returns
1468 /// the most negative integer.
1469 ///
1470 /// \headerfile <x86intrin.h>
1471 ///
1472 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1473 /// instruction.
1474 ///
1475 /// \param __a
1476 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1477 /// conversion.
1478 /// \returns A 32-bit signed integer containing the converted value.
1479 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1480 return __builtin_ia32_cvttsd2si((__v2df)__a);
1481 }
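
/* A sketch contrasting _mm_cvtsd_si32 and _mm_cvttsd_si32, assuming the
 * default rounding mode (round to nearest, ties to even):
 *
 *   __m128d __v = _mm_set_sd(2.75);
 *   int __nearest   = _mm_cvtsd_si32(__v);   // 3
 *   int __truncated = _mm_cvttsd_si32(__v);  // 2 (rounded toward zero)
 */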
1482
1483 /// Converts the two double-precision floating-point elements of a
1484 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1485 /// returned in a 64-bit vector of [2 x i32].
1486 ///
1487 /// If a converted value does not fit in a 32-bit integer, raises a
1488 /// floating-point invalid exception. If the exception is masked, returns
1489 /// the most negative integer.
1490 ///
1491 /// \headerfile <x86intrin.h>
1492 ///
1493 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1494 ///
1495 /// \param __a
1496 /// A 128-bit vector of [2 x double].
1497 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1498 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
1499 return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
1500 }
1501
1502 /// Converts the two double-precision floating-point elements of a
1503 /// 128-bit vector of [2 x double] into two signed truncated (rounded toward
1504 /// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1505 ///
1506 /// If a converted value does not fit in a 32-bit integer, raises a
1507 /// floating-point invalid exception. If the exception is masked, returns
1508 /// the most negative integer.
1509 ///
1510 /// \headerfile <x86intrin.h>
1511 ///
1512 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1513 ///
1514 /// \param __a
1515 /// A 128-bit vector of [2 x double].
1516 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1517 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
1518 return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
1519 }
1520
1521 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1522 /// [2 x i32] into two double-precision floating-point values, returned in a
1523 /// 128-bit vector of [2 x double].
1524 ///
1525 /// \headerfile <x86intrin.h>
1526 ///
1527 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1528 ///
1529 /// \param __a
1530 /// A 64-bit vector of [2 x i32].
1531 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1532 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) {
1533 return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
1534 }
1535
1536 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1537 /// a double-precision floating-point value.
1538 ///
1539 /// \headerfile <x86intrin.h>
1540 ///
1541 /// This intrinsic has no corresponding instruction.
1542 ///
1543 /// \param __a
1544 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1545 /// \returns A double-precision floating-point value copied from the lower 64
1546 /// bits of \a __a.
1547 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1548 return __a[0];
1549 }
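
/* A one-line sketch: _mm_cvtsd_f64 extracts the low lane without a
 * corresponding instruction, e.g. _mm_cvtsd_f64(_mm_set_pd(4.0, 3.0)) == 3.0.
 */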
1550
1551 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1552 /// memory location.
1553 ///
1554 /// \headerfile <x86intrin.h>
1555 ///
1556 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1557 ///
1558 /// \param __dp
1559 /// A pointer to a 128-bit memory location. The address of the memory
1560 /// location has to be 16-byte aligned.
1561 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1562 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1563 return *(const __m128d *)__dp;
1564 }
1565
1566 /// Loads a double-precision floating-point value from a specified memory
1567 /// location and duplicates it to both vector elements of a 128-bit vector of
1568 /// [2 x double].
1569 ///
1570 /// \headerfile <x86intrin.h>
1571 ///
1572 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1573 ///
1574 /// \param __dp
1575 /// A pointer to a memory location containing a double-precision value.
1576 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1577 /// duplicated values.
1578 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1579 struct __mm_load1_pd_struct {
1580 double __u;
1581 } __attribute__((__packed__, __may_alias__));
1582 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1583 return __extension__(__m128d){__u, __u};
1584 }
1585
1586 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
1587
1588 /// Loads two double-precision values, in reverse order, from an aligned
1589 /// memory location into a 128-bit vector of [2 x double].
1590 ///
1591 /// \headerfile <x86intrin.h>
1592 ///
1593 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1594 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1595 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1596 ///
1597 /// \param __dp
1598 /// A 16-byte aligned pointer to an array of double-precision values to be
1599 /// loaded in reverse order.
1600 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1601 /// values.
1602 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1603 __m128d __u = *(const __m128d *)__dp;
1604 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1605 }
1606
1607 /// Loads a 128-bit floating-point vector of [2 x double] from an
1608 /// unaligned memory location.
1609 ///
1610 /// \headerfile <x86intrin.h>
1611 ///
1612 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1613 ///
1614 /// \param __dp
1615 /// A pointer to a 128-bit memory location. The address of the memory
1616 /// location does not have to be aligned.
1617 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1618 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1619 struct __loadu_pd {
1620 __m128d_u __v;
1621 } __attribute__((__packed__, __may_alias__));
1622 return ((const struct __loadu_pd *)__dp)->__v;
1623 }
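
/* A sketch of the alignment contract for the loads above; __buf and __abuf
 * are illustrative local arrays:
 *
 *   double __buf[2] = {1.0, 2.0};
 *   __m128d __u = _mm_loadu_pd(__buf);    // any alignment is acceptable
 *
 *   double __abuf[2] __attribute__((__aligned__(16))) = {1.0, 2.0};
 *   __m128d __a = _mm_load_pd(__abuf);    // pointer must be 16-byte aligned
 */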
1624
1625 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1626 /// vector and clears the upper element.
1627 ///
1628 /// \headerfile <x86intrin.h>
1629 ///
1630 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1631 ///
1632 /// \param __a
1633 /// A pointer to a 64-bit memory location. The address of the memory
1634 /// location does not have to be aligned.
1635 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1636 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1637 struct __loadu_si64 {
1638 long long __v;
1639 } __attribute__((__packed__, __may_alias__));
1640 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1641 return __extension__(__m128i)(__v2di){__u, 0LL};
1642 }
1643
1644 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1645 /// vector and clears the upper elements.
1646 ///
1647 /// \headerfile <x86intrin.h>
1648 ///
1649 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1650 ///
1651 /// \param __a
1652 /// A pointer to a 32-bit memory location. The address of the memory
1653 /// location does not have to be aligned.
1654 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1655 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1656 struct __loadu_si32 {
1657 int __v;
1658 } __attribute__((__packed__, __may_alias__));
1659 int __u = ((const struct __loadu_si32 *)__a)->__v;
1660 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1661 }
1662
1663 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1664 /// vector and clears the upper elements.
1665 ///
1666 /// \headerfile <x86intrin.h>
1667 ///
1668 /// This intrinsic does not correspond to a specific instruction.
1669 ///
1670 /// \param __a
1671 /// A pointer to a 16-bit memory location. The address of the memory
1672 /// location does not have to be aligned.
1673 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1674 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1675 struct __loadu_si16 {
1676 short __v;
1677 } __attribute__((__packed__, __may_alias__));
1678 short __u = ((const struct __loadu_si16 *)__a)->__v;
1679 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1680 }
1681
1682 /// Loads a 64-bit double-precision value to the low element of a
1683 /// 128-bit vector of [2 x double] and clears the upper element.
1684 ///
1685 /// \headerfile <x86intrin.h>
1686 ///
1687 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1688 ///
1689 /// \param __dp
1690 /// A pointer to a memory location containing a double-precision value.
1691 /// The address of the memory location does not have to be aligned.
1692 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1693 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1694 struct __mm_load_sd_struct {
1695 double __u;
1696 } __attribute__((__packed__, __may_alias__));
1697 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1698 return __extension__(__m128d){__u, 0};
1699 }
1700
1701 /// Loads a double-precision value into the high-order bits of a 128-bit
1702 /// vector of [2 x double]. The low-order bits are copied from the low-order
1703 /// bits of the first operand.
1704 ///
1705 /// \headerfile <x86intrin.h>
1706 ///
1707 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1708 ///
1709 /// \param __a
1710 /// A 128-bit vector of [2 x double]. \n
1711 /// Bits [63:0] are written to bits [63:0] of the result.
1712 /// \param __dp
1713 /// A pointer to a 64-bit memory location containing a double-precision
1714 /// floating-point value that is loaded. The loaded value is written to bits
1715 /// [127:64] of the result. The address of the memory location does not have
1716 /// to be aligned.
1717 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1718 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1719 double const *__dp) {
1720 struct __mm_loadh_pd_struct {
1721 double __u;
1722 } __attribute__((__packed__, __may_alias__));
1723 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1724 return __extension__(__m128d){__a[0], __u};
1725 }
1726
1727 /// Loads a double-precision value into the low-order bits of a 128-bit
1728 /// vector of [2 x double]. The high-order bits are copied from the
1729 /// high-order bits of the first operand.
1730 ///
1731 /// \headerfile <x86intrin.h>
1732 ///
1733 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1734 ///
1735 /// \param __a
1736 /// A 128-bit vector of [2 x double]. \n
1737 /// Bits [127:64] are written to bits [127:64] of the result.
1738 /// \param __dp
1739 /// A pointer to a 64-bit memory location containing a double-precision
1740 /// floating-point value that is loaded. The loaded value is written to bits
1741 /// [63:0] of the result. The address of the memory location does not have to
1742 /// be aligned.
1743 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1744 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1745 double const *__dp) {
1746 struct __mm_loadl_pd_struct {
1747 double __u;
1748 } __attribute__((__packed__, __may_alias__));
1749 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1750 return __extension__(__m128d){__u, __a[1]};
1751 }
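
/* A sketch assembling a vector from two separate doubles in memory with
 * _mm_loadl_pd and _mm_loadh_pd; the variables are illustrative:
 *
 *   double __lo = 1.0, __hi = 2.0;
 *   __m128d __v = _mm_setzero_pd();
 *   __v = _mm_loadl_pd(__v, &__lo);   // {1.0, 0.0}
 *   __v = _mm_loadh_pd(__v, &__hi);   // {1.0, 2.0}
 */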
1752
1753 /// Constructs a 128-bit floating-point vector of [2 x double] with
1754 /// unspecified content. This could be used as an argument to another
1755 /// intrinsic function where the argument is required but the value is not
1756 /// actually used.
1757 ///
1758 /// \headerfile <x86intrin.h>
1759 ///
1760 /// This intrinsic has no corresponding instruction.
1761 ///
1762 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1763 /// content.
1764 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1765 return (__m128d)__builtin_ia32_undef128();
1766 }
1767
1768 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1769 /// 64 bits of the vector are initialized with the specified double-precision
1770 /// floating-point value. The upper 64 bits are set to zero.
1771 ///
1772 /// \headerfile <x86intrin.h>
1773 ///
1774 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1775 ///
1776 /// \param __w
1777 /// A double-precision floating-point value used to initialize the lower 64
1778 /// bits of the result.
1779 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1780 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1781 /// set to zero.
1782 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1783 return __extension__(__m128d){__w, 0.0};
1784 }
1785
1786 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1787 /// of the two double-precision floating-point vector elements set to the
1788 /// specified double-precision floating-point value.
1789 ///
1790 /// \headerfile <x86intrin.h>
1791 ///
1792 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1793 ///
1794 /// \param __w
1795 /// A double-precision floating-point value used to initialize each vector
1796 /// element of the result.
1797 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1798 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1799 return __extension__(__m128d){__w, __w};
1800 }
1801
1802 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1803 /// of the two double-precision floating-point vector elements set to the
1804 /// specified double-precision floating-point value.
1805 ///
1806 /// \headerfile <x86intrin.h>
1807 ///
1808 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1809 ///
1810 /// \param __w
1811 /// A double-precision floating-point value used to initialize each vector
1812 /// element of the result.
1813 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1814 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1815 return _mm_set1_pd(__w);
1816 }
1817
1818 /// Constructs a 128-bit floating-point vector of [2 x double]
1819 /// initialized with the specified double-precision floating-point values.
1820 ///
1821 /// \headerfile <x86intrin.h>
1822 ///
1823 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1824 ///
1825 /// \param __w
1826 /// A double-precision floating-point value used to initialize the upper 64
1827 /// bits of the result.
1828 /// \param __x
1829 /// A double-precision floating-point value used to initialize the lower 64
1830 /// bits of the result.
1831 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1832 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1833 double __x) {
1834 return __extension__(__m128d){__x, __w};
1835 }
1836
1837 /// Constructs a 128-bit floating-point vector of [2 x double],
1838 /// initialized in reverse order with the specified double-precision
1839 /// floating-point values.
1840 ///
1841 /// \headerfile <x86intrin.h>
1842 ///
1843 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1844 ///
1845 /// \param __w
1846 /// A double-precision floating-point value used to initialize the lower 64
1847 /// bits of the result.
1848 /// \param __x
1849 /// A double-precision floating-point value used to initialize the upper 64
1850 /// bits of the result.
1851 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1852 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1853 double __x) {
1854 return __extension__(__m128d){__w, __x};
1855 }
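
/* A sketch of the element ordering of the initializers above; the values are
 * arbitrary:
 *
 *   __m128d __a = _mm_set_pd(2.0, 1.0);   // element 0 = 1.0, element 1 = 2.0
 *   __m128d __b = _mm_setr_pd(1.0, 2.0);  // same vector, arguments in memory order
 *   // _mm_cvtsd_f64(__a) == _mm_cvtsd_f64(__b) == 1.0
 */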
1856
1857 /// Constructs a 128-bit floating-point vector of [2 x double]
1858 /// initialized to zero.
1859 ///
1860 /// \headerfile <x86intrin.h>
1861 ///
1862 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1863 ///
1864 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1865 /// all elements set to zero.
1866 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1867 return __extension__(__m128d){0.0, 0.0};
1868 }
1869
1870 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1871 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1872 /// 64 bits are set to the upper 64 bits of the first parameter.
1873 ///
1874 /// \headerfile <x86intrin.h>
1875 ///
1876 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1877 ///
1878 /// \param __a
1879 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1880 /// upper 64 bits of the result.
1881 /// \param __b
1882 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1883 /// lower 64 bits of the result.
1884 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1885 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1886 __m128d __b) {
1887 __a[0] = __b[0];
1888 return __a;
1889 }
1890
1891 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1892 /// memory location.
1893 ///
1894 /// \headerfile <x86intrin.h>
1895 ///
1896 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1897 ///
1898 /// \param __dp
1899 /// A pointer to a 64-bit memory location.
1900 /// \param __a
1901 /// A 128-bit vector of [2 x double] containing the value to be stored.
1902 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1903 __m128d __a) {
1904 struct __mm_store_sd_struct {
1905 double __u;
1906 } __attribute__((__packed__, __may_alias__));
1907 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1908 }
1909
1910 /// Moves packed double-precision values from a 128-bit vector of
1911 /// [2 x double] to a memory location.
1912 ///
1913 /// \headerfile <x86intrin.h>
1914 ///
1915 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1916 ///
1917 /// \param __dp
1918 /// A pointer to an aligned memory location that can store two
1919 /// double-precision values.
1920 /// \param __a
1921 /// A packed 128-bit vector of [2 x double] containing the values to be
1922 /// moved.
1923 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1924 __m128d __a) {
1925 *(__m128d *)__dp = __a;
1926 }
1927
1928 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1929 /// the upper and lower 64 bits of a memory location.
1930 ///
1931 /// \headerfile <x86intrin.h>
1932 ///
1933 /// This intrinsic corresponds to the
1934 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1935 ///
1936 /// \param __dp
1937 /// A pointer to a memory location that can store two double-precision
1938 /// values.
1939 /// \param __a
1940 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1941 /// of the values in \a __dp.
1942 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1943 __m128d __a) {
1944 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1945 _mm_store_pd(__dp, __a);
1946 }
1947
1948 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1949 /// the upper and lower 64 bits of a memory location.
1950 ///
1951 /// \headerfile <x86intrin.h>
1952 ///
1953 /// This intrinsic corresponds to the
1954 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1955 ///
1956 /// \param __dp
1957 /// A pointer to a memory location that can store two double-precision
1958 /// values.
1959 /// \param __a
1960 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1961 /// of the values in \a __dp.
1962 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1963 __m128d __a) {
1964 _mm_store1_pd(__dp, __a);
1965 }
1966
1967 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1968 /// location.
1969 ///
1970 /// \headerfile <x86intrin.h>
1971 ///
1972 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1973 ///
1974 /// \param __dp
1975 /// A pointer to a 128-bit memory location. The address of the memory
1976 /// location does not have to be aligned.
1977 /// \param __a
1978 /// A 128-bit vector of [2 x double] containing the values to be stored.
1979 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1980 __m128d __a) {
1981 struct __storeu_pd {
1982 __m128d_u __v;
1983 } __attribute__((__packed__, __may_alias__));
1984 ((struct __storeu_pd *)__dp)->__v = __a;
1985 }
1986
1987 /// Stores two double-precision values, in reverse order, from a 128-bit
1988 /// vector of [2 x double] to a 16-byte aligned memory location.
1989 ///
1990 /// \headerfile <x86intrin.h>
1991 ///
1992 /// This intrinsic corresponds to a shuffling instruction followed by a
1993 /// <c> VMOVAPD / MOVAPD </c> instruction.
1994 ///
1995 /// \param __dp
1996 /// A pointer to a 16-byte aligned memory location that can store two
1997 /// double-precision values.
1998 /// \param __a
1999 /// A 128-bit vector of [2 x double] containing the values to be reversed and
2000 /// stored.
2001 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
2002 __m128d __a) {
2003 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2004 *(__m128d *)__dp = __a;
2005 }
2006
2007 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2008 /// memory location.
2009 ///
2010 /// \headerfile <x86intrin.h>
2011 ///
2012 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2013 ///
2014 /// \param __dp
2015 /// A pointer to a 64-bit memory location.
2016 /// \param __a
2017 /// A 128-bit vector of [2 x double] containing the value to be stored.
2018 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2019 __m128d __a) {
2020 struct __mm_storeh_pd_struct {
2021 double __u;
2022 } __attribute__((__packed__, __may_alias__));
2023 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2024 }
2025
2026 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2027 /// memory location.
2028 ///
2029 /// \headerfile <x86intrin.h>
2030 ///
2031 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2032 ///
2033 /// \param __dp
2034 /// A pointer to a 64-bit memory location.
2035 /// \param __a
2036 /// A 128-bit vector of [2 x double] containing the value to be stored.
2037 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2038 __m128d __a) {
2039 struct __mm_storeh_pd_struct {
2040 double __u;
2041 } __attribute__((__packed__, __may_alias__));
2042 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2043 }
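
/* A sketch splitting a vector into two scalars with the stores above; the
 * values are arbitrary:
 *
 *   __m128d __v = _mm_set_pd(2.0, 1.0);   // {1.0, 2.0}
 *   double __lo, __hi;
 *   _mm_storel_pd(&__lo, __v);            // __lo == 1.0
 *   _mm_storeh_pd(&__hi, __v);            // __hi == 2.0
 */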
2044
2045 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2046 /// saving the lower 8 bits of each sum in the corresponding element of a
2047 /// 128-bit result vector of [16 x i8].
2048 ///
2049 /// The integer elements of both parameters can be either signed or unsigned.
2050 ///
2051 /// \headerfile <x86intrin.h>
2052 ///
2053 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2054 ///
2055 /// \param __a
2056 /// A 128-bit vector of [16 x i8].
2057 /// \param __b
2058 /// A 128-bit vector of [16 x i8].
2059 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2060 /// parameters.
2061 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2062 __m128i __b) {
2063 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2064 }
2065
2066 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2067 /// saving the lower 16 bits of each sum in the corresponding element of a
2068 /// 128-bit result vector of [8 x i16].
2069 ///
2070 /// The integer elements of both parameters can be either signed or unsigned.
2071 ///
2072 /// \headerfile <x86intrin.h>
2073 ///
2074 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2075 ///
2076 /// \param __a
2077 /// A 128-bit vector of [8 x i16].
2078 /// \param __b
2079 /// A 128-bit vector of [8 x i16].
2080 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2081 /// parameters.
2082 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2083 __m128i __b) {
2084 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2085 }
2086
2087 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2088 /// saving the lower 32 bits of each sum in the corresponding element of a
2089 /// 128-bit result vector of [4 x i32].
2090 ///
2091 /// The integer elements of both parameters can be either signed or unsigned.
2092 ///
2093 /// \headerfile <x86intrin.h>
2094 ///
2095 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2096 ///
2097 /// \param __a
2098 /// A 128-bit vector of [4 x i32].
2099 /// \param __b
2100 /// A 128-bit vector of [4 x i32].
2101 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2102 /// parameters.
2103 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2104 __m128i __b) {
2105 return (__m128i)((__v4su)__a + (__v4su)__b);
2106 }
2107
2108 /// Adds two signed or unsigned 64-bit integer values, returning the
2109 /// lower 64 bits of the sum.
2110 ///
2111 /// \headerfile <x86intrin.h>
2112 ///
2113 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2114 ///
2115 /// \param __a
2116 /// A 64-bit integer.
2117 /// \param __b
2118 /// A 64-bit integer.
2119 /// \returns A 64-bit integer containing the sum of both parameters.
2120 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
2121 return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
2122 }
2123
2124 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2125 /// saving the lower 64 bits of each sum in the corresponding element of a
2126 /// 128-bit result vector of [2 x i64].
2127 ///
2128 /// The integer elements of both parameters can be either signed or unsigned.
2129 ///
2130 /// \headerfile <x86intrin.h>
2131 ///
2132 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2133 ///
2134 /// \param __a
2135 /// A 128-bit vector of [2 x i64].
2136 /// \param __b
2137 /// A 128-bit vector of [2 x i64].
2138 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2139 /// parameters.
2140 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2141 __m128i __b) {
2142 return (__m128i)((__v2du)__a + (__v2du)__b);
2143 }
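
/* A sketch of the wrapping (non-saturating) behavior of the additions above;
 * the values are arbitrary:
 *
 *   __m128i __a = _mm_set1_epi8(0x7F);
 *   __m128i __b = _mm_set1_epi8(1);
 *   __m128i __s = _mm_add_epi8(__a, __b);  // each byte wraps to 0x80 (-128)
 */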
2144
2145 /// Adds, with saturation, the corresponding elements of two 128-bit
2146 /// signed [16 x i8] vectors, saving each sum in the corresponding element
2147 /// of a 128-bit result vector of [16 x i8].
2148 ///
2149 /// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2150 /// less than 0x80 are saturated to 0x80.
2151 ///
2152 /// \headerfile <x86intrin.h>
2153 ///
2154 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2155 ///
2156 /// \param __a
2157 /// A 128-bit signed [16 x i8] vector.
2158 /// \param __b
2159 /// A 128-bit signed [16 x i8] vector.
2160 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2161 /// both parameters.
2162 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2163 __m128i __b) {
2164 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2165 }
2166
2167 /// Adds, with saturation, the corresponding elements of two 128-bit
2168 /// signed [8 x i16] vectors, saving each sum in the corresponding element
2169 /// of a 128-bit result vector of [8 x i16].
2170 ///
2171 /// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2172 /// less than 0x8000 are saturated to 0x8000.
2173 ///
2174 /// \headerfile <x86intrin.h>
2175 ///
2176 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2177 ///
2178 /// \param __a
2179 /// A 128-bit signed [8 x i16] vector.
2180 /// \param __b
2181 /// A 128-bit signed [8 x i16] vector.
2182 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2183 /// both parameters.
2184 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2185 __m128i __b) {
2186 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2187 }
2188
2189 /// Adds, with saturation, the corresponding elements of two 128-bit
2190 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2191 /// of a 128-bit result vector of [16 x i8].
2192 ///
2193 /// Sums of unsigned values cannot be negative; sums greater than 0xFF are
2194 /// saturated to 0xFF.
2195 ///
2196 /// \headerfile <x86intrin.h>
2197 ///
2198 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2199 ///
2200 /// \param __a
2201 /// A 128-bit unsigned [16 x i8] vector.
2202 /// \param __b
2203 /// A 128-bit unsigned [16 x i8] vector.
2204 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2205 /// of both parameters.
2206 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2207 __m128i __b) {
2208 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2209 }
2210
2211 /// Adds, with saturation, the corresponding elements of two 128-bit
2212 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2213 /// of a 128-bit result vector of [8 x i16].
2214 ///
2215 /// Sums of unsigned values cannot be negative; sums greater than 0xFFFF are
2216 /// saturated to 0xFFFF.
2217 ///
2218 /// \headerfile <x86intrin.h>
2219 ///
2220 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2221 ///
2222 /// \param __a
2223 /// A 128-bit unsigned [8 x i16] vector.
2224 /// \param __b
2225 /// A 128-bit unsigned [8 x i16] vector.
2226 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2227 /// of both parameters.
2228 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2229 __m128i __b) {
2230 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2231 }
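
/* A sketch contrasting signed and unsigned saturation for the additions
 * above; the values are arbitrary:
 *
 *   __m128i __one = _mm_set1_epi8(1);
 *   __m128i __s = _mm_adds_epi8(_mm_set1_epi8(0x7F), __one);        // stays 0x7F
 *   __m128i __u = _mm_adds_epu8(_mm_set1_epi8((char)0xFF), __one);  // stays 0xFF
 */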
2232
2233 /// Computes the rounded averages of corresponding elements of two
2234 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2235 /// corresponding element of a 128-bit result vector of [16 x i8].
2236 ///
2237 /// \headerfile <x86intrin.h>
2238 ///
2239 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2240 ///
2241 /// \param __a
2242 /// A 128-bit unsigned [16 x i8] vector.
2243 /// \param __b
2244 /// A 128-bit unsigned [16 x i8] vector.
2245 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2246 /// averages of both parameters.
2247 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2248 __m128i __b) {
2249 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2250 }
2251
2252 /// Computes the rounded averages of corresponding elements of two
2253 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2254 /// corresponding element of a 128-bit result vector of [8 x i16].
2255 ///
2256 /// \headerfile <x86intrin.h>
2257 ///
2258 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2259 ///
2260 /// \param __a
2261 /// A 128-bit unsigned [8 x i16] vector.
2262 /// \param __b
2263 /// A 128-bit unsigned [8 x i16] vector.
2264 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2265 /// averages of both parameters.
2266 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2267 __m128i __b) {
2268 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2269 }
2270
2271 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2272 /// vectors, producing eight intermediate 32-bit signed integer products, and
2273 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2274 /// [4 x i32] vector.
2275 ///
2276 /// For example, bits [15:0] of both parameters are multiplied producing a
2277 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2278 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2279 /// of the result.
2280 ///
2281 /// \headerfile <x86intrin.h>
2282 ///
2283 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2284 ///
2285 /// \param __a
2286 /// A 128-bit signed [8 x i16] vector.
2287 /// \param __b
2288 /// A 128-bit signed [8 x i16] vector.
2289 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2290 /// of both parameters.
2291 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2292 __m128i __b) {
2293 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2294 }
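
/* A sketch of the pairwise multiply-add above; the values are arbitrary:
 *
 *   __m128i __a = _mm_set1_epi16(3);
 *   __m128i __b = _mm_set1_epi16(4);
 *   __m128i __r = _mm_madd_epi16(__a, __b);  // each i32 lane = 3*4 + 3*4 = 24
 */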
2295
2296 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2297 /// vectors, saving the greater value from each comparison in the
2298 /// corresponding element of a 128-bit result vector of [8 x i16].
2299 ///
2300 /// \headerfile <x86intrin.h>
2301 ///
2302 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2303 ///
2304 /// \param __a
2305 /// A 128-bit signed [8 x i16] vector.
2306 /// \param __b
2307 /// A 128-bit signed [8 x i16] vector.
2308 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2309 /// each comparison.
2310 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2311 __m128i __b) {
2312 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2313 }
2314
2315 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2316 /// vectors, saving the greater value from each comparison in the
2317 /// corresponding element of a 128-bit result vector of [16 x i8].
2318 ///
2319 /// \headerfile <x86intrin.h>
2320 ///
2321 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2322 ///
2323 /// \param __a
2324 /// A 128-bit unsigned [16 x i8] vector.
2325 /// \param __b
2326 /// A 128-bit unsigned [16 x i8] vector.
2327 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2328 /// each comparison.
2329 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2330 __m128i __b) {
2331 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2332 }
2333
2334 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2335 /// vectors, saving the smaller value from each comparison in the
2336 /// corresponding element of a 128-bit result vector of [8 x i16].
2337 ///
2338 /// \headerfile <x86intrin.h>
2339 ///
2340 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2341 ///
2342 /// \param __a
2343 /// A 128-bit signed [8 x i16] vector.
2344 /// \param __b
2345 /// A 128-bit signed [8 x i16] vector.
2346 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2347 /// each comparison.
2348 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2349 __m128i __b) {
2350 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2351 }
2352
2353 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2354 /// vectors, saving the smaller value from each comparison in the
2355 /// corresponding element of a 128-bit result vector of [16 x i8].
2356 ///
2357 /// \headerfile <x86intrin.h>
2358 ///
2359 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2360 ///
2361 /// \param __a
2362 /// A 128-bit unsigned [16 x i8] vector.
2363 /// \param __b
2364 /// A 128-bit unsigned [16 x i8] vector.
2365 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2366 /// each comparison.
2367 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2368 __m128i __b) {
2369 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2370 }
2371
2372 /// Multiplies the corresponding elements of two signed [8 x i16]
2373 /// vectors, saving the upper 16 bits of each 32-bit product in the
2374 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2375 ///
2376 /// \headerfile <x86intrin.h>
2377 ///
2378 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2379 ///
2380 /// \param __a
2381 /// A 128-bit signed [8 x i16] vector.
2382 /// \param __b
2383 /// A 128-bit signed [8 x i16] vector.
2384 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2385 /// each of the eight 32-bit products.
2386 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2387 __m128i __b) {
2388 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2389 }
2390
2391 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2392 /// vectors, saving the upper 16 bits of each 32-bit product in the
2393 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2394 ///
2395 /// \headerfile <x86intrin.h>
2396 ///
2397 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2398 ///
2399 /// \param __a
2400 /// A 128-bit unsigned [8 x i16] vector.
2401 /// \param __b
2402 /// A 128-bit unsigned [8 x i16] vector.
2403 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2404 /// of each of the eight 32-bit products.
2405 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2406 __m128i __b) {
2407 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2408 }
2409
2410 /// Multiplies the corresponding elements of two signed [8 x i16]
2411 /// vectors, saving the lower 16 bits of each 32-bit product in the
2412 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2413 ///
2414 /// \headerfile <x86intrin.h>
2415 ///
2416 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2417 ///
2418 /// \param __a
2419 /// A 128-bit signed [8 x i16] vector.
2420 /// \param __b
2421 /// A 128-bit signed [8 x i16] vector.
2422 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2423 /// each of the eight 32-bit products.
2424 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2425 __m128i __b) {
2426 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2427 }
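
/* A sketch combining _mm_mullo_epi16 and _mm_mulhi_epi16 to recover the full
 * 32-bit products for the lower four lanes; the values are arbitrary:
 *
 *   __m128i __a  = _mm_set1_epi16(300);
 *   __m128i __b  = _mm_set1_epi16(400);
 *   __m128i __lo = _mm_mullo_epi16(__a, __b);       // low 16 bits of 120000
 *   __m128i __hi = _mm_mulhi_epi16(__a, __b);       // high 16 bits of 120000
 *   __m128i __p  = _mm_unpacklo_epi16(__lo, __hi);  // four i32 lanes of 120000
 */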
2428
2429 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2430 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2431 /// product.
2432 ///
2433 /// \headerfile <x86intrin.h>
2434 ///
2435 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2436 ///
2437 /// \param __a
2438 /// A 64-bit integer containing one of the source operands.
2439 /// \param __b
2440 /// A 64-bit integer containing one of the source operands.
2441 /// \returns A 64-bit integer vector containing the product of both operands.
2442 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
2443 return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
2444 (__v4si)__anyext128(__b)));
2445 }
2446
2447 /// Multiplies 32-bit unsigned integer values contained in the lower
2448 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2449 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2450 ///
2451 /// \headerfile <x86intrin.h>
2452 ///
2453 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2454 ///
2455 /// \param __a
2456 /// A [2 x i64] vector containing one of the source operands.
2457 /// \param __b
2458 /// A [2 x i64] vector containing one of the source operands.
2459 /// \returns A [2 x i64] vector containing the product of both operands.
2460 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2461 __m128i __b) {
2462 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2463 }
2464
2465 /// Computes the absolute differences of corresponding 8-bit integer
2466 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2467 /// separately sums the second 8 absolute differences. Packs these two
2468 /// unsigned 16-bit integer sums into the upper and lower elements of a
2469 /// [2 x i64] vector.
2470 ///
2471 /// \headerfile <x86intrin.h>
2472 ///
2473 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2474 ///
2475 /// \param __a
2476 /// A 128-bit integer vector containing one of the source operands.
2477 /// \param __b
2478 /// A 128-bit integer vector containing one of the source operands.
2479 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2480 /// differences between both operands.
2481 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2482 __m128i __b) {
2483 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2484 }
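
/* A sketch of the sum-of-absolute-differences operation above; the values
 * are arbitrary:
 *
 *   __m128i __a = _mm_set1_epi8(5);
 *   __m128i __b = _mm_set1_epi8(2);
 *   __m128i __s = _mm_sad_epu8(__a, __b);  // each 64-bit half = 8 * |5 - 2| = 24
 */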
2485
2486 /// Subtracts the corresponding 8-bit integer values in the operands.
2487 ///
2488 /// \headerfile <x86intrin.h>
2489 ///
2490 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2491 ///
2492 /// \param __a
2493 /// A 128-bit integer vector containing the minuends.
2494 /// \param __b
2495 /// A 128-bit integer vector containing the subtrahends.
2496 /// \returns A 128-bit integer vector containing the differences of the values
2497 /// in the operands.
2498 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2499 __m128i __b) {
2500 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2501 }
2502
2503 /// Subtracts the corresponding 16-bit integer values in the operands.
2504 ///
2505 /// \headerfile <x86intrin.h>
2506 ///
2507 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2508 ///
2509 /// \param __a
2510 /// A 128-bit integer vector containing the minuends.
2511 /// \param __b
2512 /// A 128-bit integer vector containing the subtrahends.
2513 /// \returns A 128-bit integer vector containing the differences of the values
2514 /// in the operands.
2515 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2516 __m128i __b) {
2517 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2518 }
2519
2520 /// Subtracts the corresponding 32-bit integer values in the operands.
2521 ///
2522 /// \headerfile <x86intrin.h>
2523 ///
2524 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2525 ///
2526 /// \param __a
2527 /// A 128-bit integer vector containing the minuends.
2528 /// \param __b
2529 /// A 128-bit integer vector containing the subtrahends.
2530 /// \returns A 128-bit integer vector containing the differences of the values
2531 /// in the operands.
2532 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2533 __m128i __b) {
2534 return (__m128i)((__v4su)__a - (__v4su)__b);
2535 }
2536
2537 /// Subtracts signed or unsigned 64-bit integer values and writes the
2538 /// difference to the corresponding bits in the destination.
2539 ///
2540 /// \headerfile <x86intrin.h>
2541 ///
2542 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2543 ///
2544 /// \param __a
2545 /// A 64-bit integer vector containing the minuend.
2546 /// \param __b
2547 /// A 64-bit integer vector containing the subtrahend.
2548 /// \returns A 64-bit integer vector containing the difference of the values in
2549 /// the operands.
_mm_sub_si64(__m64 __a,__m64 __b)2550 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
2551 return (__m64)((unsigned long long)__a - (unsigned long long)__b);
2552 }
2553
2554 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2555 ///
2556 /// \headerfile <x86intrin.h>
2557 ///
2558 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2559 ///
2560 /// \param __a
2561 /// A 128-bit integer vector containing the minuends.
2562 /// \param __b
2563 /// A 128-bit integer vector containing the subtrahends.
2564 /// \returns A 128-bit integer vector containing the differences of the values
2565 /// in the operands.
_mm_sub_epi64(__m128i __a,__m128i __b)2566 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2567 __m128i __b) {
2568 return (__m128i)((__v2du)__a - (__v2du)__b);
2569 }
2570
2571 /// Subtracts, with saturation, corresponding 8-bit signed integer values in
2572 /// the input and returns the differences in the corresponding bytes in the
2573 /// destination.
2574 ///
2575 /// Differences greater than 0x7F are saturated to 0x7F, and differences
2576 /// less than 0x80 are saturated to 0x80.
2577 ///
2578 /// \headerfile <x86intrin.h>
2579 ///
2580 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2581 ///
2582 /// \param __a
2583 /// A 128-bit integer vector containing the minuends.
2584 /// \param __b
2585 /// A 128-bit integer vector containing the subtrahends.
2586 /// \returns A 128-bit integer vector containing the differences of the values
2587 /// in the operands.
_mm_subs_epi8(__m128i __a,__m128i __b)2588 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2589 __m128i __b) {
2590 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2591 }
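
/* Illustrative usage of _mm_subs_epi8 showing signed saturation (a minimal
 * sketch with hypothetical variable names, not part of the header):
 *
 *   __m128i a = _mm_set1_epi8(-100);
 *   __m128i b = _mm_set1_epi8(100);
 *   __m128i d = _mm_subs_epi8(a, b); // -200 saturates, so every byte is -128
 */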

/// Subtracts, with saturation, corresponding 16-bit signed integer values in
/// the input and returns the differences in the corresponding elements of the
/// destination.
///
/// Differences greater than 0x7FFF are saturated to 0x7FFF, and differences
/// less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
/// \param __b
/// A 128-bit integer vector containing the subtrahends.
/// \returns A 128-bit integer vector containing the differences of the values
/// in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
                                                            __m128i __b) {
  return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
}

/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
/// the input and returns the differences in the corresponding bytes in the
/// destination.
///
/// Differences less than 0x00 are saturated to 0x00.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
/// \param __b
/// A 128-bit integer vector containing the subtrahends.
/// \returns A 128-bit integer vector containing the unsigned integer
/// differences of the values in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
                                                           __m128i __b) {
  return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
}

/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
/// the input and returns the differences in the corresponding elements of the
/// destination.
///
/// Differences less than 0x0000 are saturated to 0x0000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
/// \param __b
/// A 128-bit integer vector containing the subtrahends.
/// \returns A 128-bit integer vector containing the unsigned integer
/// differences of the values in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
                                                            __m128i __b) {
  return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
}
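
/* Illustrative usage of _mm_subs_epu16 showing unsigned saturation (a minimal
 * sketch with hypothetical variable names, not part of the header):
 *
 *   __m128i a = _mm_set1_epi16(10);
 *   __m128i b = _mm_set1_epi16(30);
 *   __m128i d = _mm_subs_epu16(a, b); // 10 - 30 clamps to 0 in every element
 */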

/// Performs a bitwise AND of two 128-bit integer vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
/// \param __b
/// A 128-bit integer vector containing one of the source operands.
/// \returns A 128-bit integer vector containing the bitwise AND of the values
/// in both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
                                                           __m128i __b) {
  return (__m128i)((__v2du)__a & (__v2du)__b);
}

/// Performs a bitwise AND of two 128-bit integer vectors, using the
/// one's complement of the values contained in the first source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
///
/// \param __a
/// A 128-bit vector containing the left source operand. The one's complement
/// of this value is used in the bitwise AND.
/// \param __b
/// A 128-bit vector containing the right source operand.
/// \returns A 128-bit integer vector containing the bitwise AND of the one's
/// complement of the first operand and the values in the second operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
                                                              __m128i __b) {
  return (__m128i)(~(__v2du)__a & (__v2du)__b);
}
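
/* Illustrative usage of _mm_andnot_si128 to clear selected bits (a minimal
 * sketch with hypothetical variable names, not part of the header):
 *
 *   __m128i mask = _mm_set1_epi32(0x000000FF);
 *   __m128i v    = _mm_set1_epi32(0x12345678);
 *   __m128i r    = _mm_andnot_si128(mask, v); // ~mask & v = 0x12345600 per lane
 */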

/// Performs a bitwise OR of two 128-bit integer vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
/// \param __b
/// A 128-bit integer vector containing one of the source operands.
/// \returns A 128-bit integer vector containing the bitwise OR of the values
/// in both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
                                                          __m128i __b) {
  return (__m128i)((__v2du)__a | (__v2du)__b);
}

/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
/// \param __b
/// A 128-bit integer vector containing one of the source operands.
/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
/// values in both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
                                                           __m128i __b) {
  return (__m128i)((__v2du)__a ^ (__v2du)__b);
}

/// Left-shifts the 128-bit integer vector operand by the specified
/// number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
/// A 128-bit integer vector containing the source operand.
/// \param imm
/// An immediate value specifying the number of bytes to left-shift operand
/// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))

#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
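
/* Illustrative usage of the byte-shift macros (a minimal sketch with
 * hypothetical variable names, not part of the header). The shift count must
 * be an integer constant expression:
 *
 *   __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
 *                            7, 6, 5, 4, 3, 2, 1, 0);
 *   __m128i r = _mm_slli_si128(v, 4); // bytes move up by 4; bytes [3:0] are 0
 */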

/// Left-shifts each 16-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
                                                            int __count) {
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
}

/// Left-shifts each 16-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
                                                           __m128i __count) {
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
}

/// Left-shifts each 32-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
                                                            int __count) {
  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
}

/// Left-shifts each 32-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
                                                           __m128i __count) {
  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
}

/// Left-shifts each 64-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
                                                            int __count) {
  return __builtin_ia32_psllqi128((__v2di)__a, __count);
}

/// Left-shifts each 64-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
                                                           __m128i __count) {
  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
}
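
/* Illustrative usage of the shift-by-vector form (a minimal sketch with
 * hypothetical variable names, not part of the header). The count is taken
 * from bits [63:0] of the second operand:
 *
 *   __m128i v = _mm_set_epi64x(1, 1);
 *   __m128i c = _mm_cvtsi32_si128(3);  // shift count = 3
 *   __m128i r = _mm_sll_epi64(v, c);   // both 64-bit lanes become 8
 */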

/// Right-shifts each 16-bit value in the 128-bit integer vector operand
/// by the specified number of bits. High-order bits are filled with the sign
/// bit of the initial value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
                                                            int __count) {
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
}

/// Right-shifts each 16-bit value in the 128-bit integer vector operand
/// by the specified number of bits. High-order bits are filled with the sign
/// bit of the initial value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
                                                           __m128i __count) {
  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
}

/// Right-shifts each 32-bit value in the 128-bit integer vector operand
/// by the specified number of bits. High-order bits are filled with the sign
/// bit of the initial value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
                                                            int __count) {
  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
}

/// Right-shifts each 32-bit value in the 128-bit integer vector operand
/// by the specified number of bits. High-order bits are filled with the sign
/// bit of the initial value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
                                                           __m128i __count) {
  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
}
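
/* Illustrative usage of arithmetic right shifts (a minimal sketch with
 * hypothetical variable names, not part of the header). The sign bit is
 * replicated into the vacated high-order bits:
 *
 *   __m128i v = _mm_set1_epi32(-16);
 *   __m128i r = _mm_srai_epi32(v, 2); // every 32-bit element becomes -4
 */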

/// Right-shifts the 128-bit integer vector operand by the specified
/// number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
/// A 128-bit integer vector containing the source operand.
/// \param imm
/// An immediate value specifying the number of bytes to right-shift operand
/// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))

#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))

/// Right-shifts each of the 16-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
                                                            int __count) {
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
}

/// Right-shifts each of the 16-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
                                                           __m128i __count) {
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
}

/// Right-shifts each of the 32-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
                                                            int __count) {
  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
}

/// Right-shifts each of the 32-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
                                                           __m128i __count) {
  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
}

/// Right-shifts each of the 64-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
                                                            int __count) {
  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
}

/// Right-shifts each of the 64-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
                                                           __m128i __count) {
  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
}

/// Compares each of the corresponding 8-bit values of the 128-bit
/// integer vectors for equality.
///
/// Each comparison returns 0x0 for false, 0xFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
                                                            __m128i __b) {
  return (__m128i)((__v16qi)__a == (__v16qi)__b);
}

/// Compares each of the corresponding 16-bit values of the 128-bit
/// integer vectors for equality.
///
/// Each comparison returns 0x0 for false, 0xFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
                                                             __m128i __b) {
  return (__m128i)((__v8hi)__a == (__v8hi)__b);
}

/// Compares each of the corresponding 32-bit values of the 128-bit
/// integer vectors for equality.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
                                                             __m128i __b) {
  return (__m128i)((__v4si)__a == (__v4si)__b);
}

/// Compares each of the corresponding signed 8-bit values of the 128-bit
/// integer vectors to determine if the values in the first operand are
/// greater than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
                                                            __m128i __b) {
  /* This function always performs a signed comparison, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m128i)((__v16qs)__a > (__v16qs)__b);
}

/// Compares each of the corresponding signed 16-bit values of the
/// 128-bit integer vectors to determine if the values in the first operand
/// are greater than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
                                                             __m128i __b) {
  return (__m128i)((__v8hi)__a > (__v8hi)__b);
}

/// Compares each of the corresponding signed 32-bit values of the
/// 128-bit integer vectors to determine if the values in the first operand
/// are greater than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
                                                             __m128i __b) {
  return (__m128i)((__v4si)__a > (__v4si)__b);
}
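
/* Illustrative usage of the comparison results as a select mask (a minimal
 * sketch with hypothetical variable names, not part of the header). Because
 * true lanes are all-ones, a per-lane maximum can be built with AND/ANDNOT/OR:
 *
 *   __m128i a = _mm_set_epi32(4, 3, 2, 1);
 *   __m128i b = _mm_set_epi32(1, 5, 2, 7);
 *   __m128i m = _mm_cmpgt_epi32(a, b);                   // all-ones where a > b
 *   __m128i maxv = _mm_or_si128(_mm_and_si128(m, a),
 *                               _mm_andnot_si128(m, b)); // per-lane max(a, b)
 */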

/// Compares each of the corresponding signed 8-bit values of the 128-bit
/// integer vectors to determine if the values in the first operand are less
/// than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
                                                            __m128i __b) {
  return _mm_cmpgt_epi8(__b, __a);
}

/// Compares each of the corresponding signed 16-bit values of the
/// 128-bit integer vectors to determine if the values in the first operand
/// are less than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
                                                             __m128i __b) {
  return _mm_cmpgt_epi16(__b, __a);
}

/// Compares each of the corresponding signed 32-bit values of the
/// 128-bit integer vectors to determine if the values in the first operand
/// are less than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \param __b
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
                                                             __m128i __b) {
  return _mm_cmpgt_epi32(__b, __a);
}

#ifdef __x86_64__
/// Converts a 64-bit signed integer value from the second operand into a
/// double-precision value and returns it in the lower element of a [2 x
/// double] vector; the upper element of the returned vector is copied from
/// the upper element of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
/// copied to the upper 64 bits of the destination.
/// \param __b
/// A 64-bit signed integer operand containing the value to be converted.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// converted value of the second operand. The upper 64 bits are copied from
/// the upper 64 bits of the first operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
                                                            long long __b) {
  __a[0] = __b;
  return __a;
}

/// Converts the first (lower) element of a vector of [2 x double] into a
/// 64-bit signed integer value.
///
/// If the converted value does not fit in a 64-bit integer, raises a
/// floating-point invalid exception. If the exception is masked, returns
/// the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
/// conversion.
/// \returns A 64-bit signed integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
  return __builtin_ia32_cvtsd2si64((__v2df)__a);
}

/// Converts the first (lower) element of a vector of [2 x double] into a
/// 64-bit signed truncated (rounded toward zero) integer value.
///
/// If the converted value does not fit in a 64-bit integer, raises a
/// floating-point invalid exception. If the exception is masked, returns
/// the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
/// instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
/// conversion.
/// \returns A 64-bit signed integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
  return __builtin_ia32_cvttsd2si64((__v2df)__a);
}
#endif

/// Converts a vector of [4 x i32] into a vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \returns A 128-bit vector of [4 x float] containing the converted values.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
  return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
}

/// Converts a vector of [4 x float] into a vector of [4 x i32].
///
/// If a converted value does not fit in a 32-bit integer, raises a
/// floating-point invalid exception. If the exception is masked, returns
/// the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit integer vector of [4 x i32] containing the converted
/// values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
}

/// Converts a vector of [4 x float] into four signed truncated (rounded toward
/// zero) 32-bit integers, returned in a vector of [4 x i32].
///
/// If a converted value does not fit in a 32-bit integer, raises a
/// floating-point invalid exception. If the exception is masked, returns
/// the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
/// instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x i32] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
}
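
/* Illustrative comparison of _mm_cvtps_epi32 and _mm_cvttps_epi32 (a minimal
 * sketch with hypothetical variable names, not part of the header), assuming
 * the default MXCSR rounding mode (round to nearest, ties to even):
 *
 *   __m128  f  = _mm_set_ps(2.5f, -1.5f, 1.7f, 1.2f);
 *   __m128i r0 = _mm_cvtps_epi32(f);  // elements: 1, 2, -2, 2
 *   __m128i r1 = _mm_cvttps_epi32(f); // elements: 1, 1, -1, 2 (truncated)
 */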

/// Returns a vector of [4 x i32] where the lowest element is the input
/// operand and the remaining elements are zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __a
/// A 32-bit signed integer operand.
/// \returns A 128-bit vector of [4 x i32].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
  return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
}

/// Returns a vector of [2 x i64] where the lower element is the input
/// operand and the upper element is zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
/// in 64-bit mode.
///
/// \param __a
/// A 64-bit signed integer operand containing the value to be converted.
/// \returns A 128-bit vector of [2 x i64] containing the converted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
  return __extension__(__m128i)(__v2di){__a, 0};
}

/// Moves the least significant 32 bits of a vector of [4 x i32] to a
/// 32-bit signed integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __a
/// A vector of [4 x i32]. The least significant 32 bits are moved to the
/// destination.
/// \returns A 32-bit signed integer containing the moved value.
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
  __v4si __b = (__v4si)__a;
  return __b[0];
}

/// Moves the least significant 64 bits of a vector of [2 x i64] to a
/// 64-bit signed integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __a
/// A vector of [2 x i64]. The least significant 64 bits are moved to the
/// destination.
/// \returns A 64-bit signed integer containing the moved value.
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
  return __a[0];
}

/// Moves packed integer values from an aligned 128-bit memory location
/// to elements in a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
///
/// \param __p
/// An aligned pointer to a memory location containing integer values.
/// \returns A 128-bit integer vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_load_si128(__m128i const *__p) {
  return *__p;
}

/// Moves packed integer values from an unaligned 128-bit memory location
/// to elements in a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
///
/// \param __p
/// A pointer to a memory location containing integer values.
/// \returns A 128-bit integer vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i_u const *__p) {
  struct __loadu_si128 {
    __m128i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_si128 *)__p)->__v;
}

/// Returns a vector of [2 x i64] where the lower element is taken from
/// the lower element of the operand, and the upper element is zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __p
/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
/// the destination.
/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
/// moved value. The higher order bits are cleared.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i_u const *__p) {
  struct __mm_loadl_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  return __extension__(__m128i){
      ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
}
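
/* Illustrative usage of the unaligned load intrinsics (a minimal sketch with
 * hypothetical variable names, not part of the header). No 16-byte alignment
 * is required for the source pointers:
 *
 *   int buf[4] = {1, 2, 3, 4};
 *   long long q = 42;
 *   __m128i v   = _mm_loadu_si128((const __m128i_u *)buf); // {1, 2, 3, 4}
 *   __m128i lo  = _mm_loadl_epi64((const __m128i_u *)&q);  // {42, 0} as 2 x i64
 */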

/// Generates a 128-bit vector of [4 x i32] with unspecified content.
/// This could be used as an argument to another intrinsic function where the
/// argument is required but the value is not actually used.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x i32] with unspecified content.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
  return (__m128i)__builtin_ia32_undef128();
}

/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
/// the specified 64-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __q1
/// A 64-bit integer value used to initialize the upper 64 bits of the
/// destination vector of [2 x i64].
/// \param __q0
/// A 64-bit integer value used to initialize the lower 64 bits of the
/// destination vector of [2 x i64].
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
/// provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
                                                            long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
/// the specified 64-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __q1
/// A 64-bit integer value used to initialize the upper 64 bits of the
/// destination vector of [2 x i64].
/// \param __q0
/// A 64-bit integer value used to initialize the lower 64 bits of the
/// destination vector of [2 x i64].
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
/// provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
                                                           __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
}

/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
/// the specified 32-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __i3
/// A 32-bit integer value used to initialize bits [127:96] of the
/// destination vector.
/// \param __i2
/// A 32-bit integer value used to initialize bits [95:64] of the destination
/// vector.
/// \param __i1
/// A 32-bit integer value used to initialize bits [63:32] of the destination
/// vector.
/// \param __i0
/// A 32-bit integer value used to initialize bits [31:0] of the destination
/// vector.
/// \returns An initialized 128-bit vector of [4 x i32] containing the values
/// provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
                                                           int __i1, int __i0) {
  return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
}

/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
/// the specified 16-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __w7
/// A 16-bit integer value used to initialize bits [127:112] of the
/// destination vector.
/// \param __w6
/// A 16-bit integer value used to initialize bits [111:96] of the
/// destination vector.
/// \param __w5
/// A 16-bit integer value used to initialize bits [95:80] of the destination
/// vector.
/// \param __w4
/// A 16-bit integer value used to initialize bits [79:64] of the destination
/// vector.
/// \param __w3
/// A 16-bit integer value used to initialize bits [63:48] of the destination
/// vector.
/// \param __w2
/// A 16-bit integer value used to initialize bits [47:32] of the destination
/// vector.
/// \param __w1
/// A 16-bit integer value used to initialize bits [31:16] of the destination
/// vector.
/// \param __w0
/// A 16-bit integer value used to initialize bits [15:0] of the destination
/// vector.
/// \returns An initialized 128-bit vector of [8 x i16] containing the values
/// provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
              short __w2, short __w1, short __w0) {
  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
                                        __w4, __w5, __w6, __w7};
}

/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
/// the specified 8-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __b15
/// Initializes bits [127:120] of the destination vector.
/// \param __b14
/// Initializes bits [119:112] of the destination vector.
/// \param __b13
/// Initializes bits [111:104] of the destination vector.
/// \param __b12
/// Initializes bits [103:96] of the destination vector.
/// \param __b11
/// Initializes bits [95:88] of the destination vector.
/// \param __b10
/// Initializes bits [87:80] of the destination vector.
/// \param __b9
/// Initializes bits [79:72] of the destination vector.
/// \param __b8
/// Initializes bits [71:64] of the destination vector.
/// \param __b7
/// Initializes bits [63:56] of the destination vector.
/// \param __b6
/// Initializes bits [55:48] of the destination vector.
/// \param __b5
/// Initializes bits [47:40] of the destination vector.
/// \param __b4
/// Initializes bits [39:32] of the destination vector.
/// \param __b3
/// Initializes bits [31:24] of the destination vector.
/// \param __b2
/// Initializes bits [23:16] of the destination vector.
/// \param __b1
/// Initializes bits [15:8] of the destination vector.
/// \param __b0
/// Initializes bits [7:0] of the destination vector.
/// \returns An initialized 128-bit vector of [16 x i8] containing the values
/// provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
             char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
             char __b4, char __b3, char __b2, char __b1, char __b0) {
  return __extension__(__m128i)(__v16qi){
      __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
      __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
}

/// Initializes both values in a 128-bit integer vector with the
/// specified 64-bit integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __q
/// Integer value used to initialize the elements of the destination integer
/// vector.
/// \returns An initialized 128-bit integer vector of [2 x i64] with both
/// elements containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
  return _mm_set_epi64x(__q, __q);
}

/// Initializes both values in a 128-bit vector of [2 x i64] with the
/// specified 64-bit value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __q
/// A 64-bit value used to initialize the elements of the destination integer
/// vector.
/// \returns An initialized 128-bit vector of [2 x i64] with all elements
/// containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
  return _mm_set_epi64(__q, __q);
}

/// Initializes all values in a 128-bit vector of [4 x i32] with the
/// specified 32-bit value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __i
/// A 32-bit value used to initialize the elements of the destination integer
/// vector.
/// \returns An initialized 128-bit vector of [4 x i32] with all elements
/// containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
  return _mm_set_epi32(__i, __i, __i, __i);
}

/// Initializes all values in a 128-bit vector of [8 x i16] with the
/// specified 16-bit value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __w
/// A 16-bit value used to initialize the elements of the destination integer
/// vector.
/// \returns An initialized 128-bit vector of [8 x i16] with all elements
/// containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
}

/// Initializes all values in a 128-bit vector of [16 x i8] with the
/// specified 8-bit value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __b
/// An 8-bit value used to initialize the elements of the destination integer
/// vector.
/// \returns An initialized 128-bit vector of [16 x i8] with all elements
/// containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
                      __b, __b, __b, __b, __b);
}

/// Constructs a 128-bit integer vector, initialized in reverse order
/// with the specified 64-bit integral values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __q0
/// A 64-bit integral value used to initialize the lower 64 bits of the
/// result.
/// \param __q1
/// A 64-bit integral value used to initialize the upper 64 bits of the
/// result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
                                                            __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

/// Constructs a 128-bit integer vector, initialized in reverse order
/// with the specified 32-bit integral values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __i0
/// A 32-bit integral value used to initialize bits [31:0] of the result.
/// \param __i1
/// A 32-bit integral value used to initialize bits [63:32] of the result.
/// \param __i2
/// A 32-bit integral value used to initialize bits [95:64] of the result.
/// \param __i3
/// A 32-bit integral value used to initialize bits [127:96] of the result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
                                                            int __i2,
                                                            int __i3) {
  return _mm_set_epi32(__i3, __i2, __i1, __i0);
}
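
/* Illustrative contrast between the set and setr forms (a minimal sketch with
 * hypothetical variable names, not part of the header). _mm_set_epi32 takes
 * its arguments from the most significant element down, while _mm_setr_epi32
 * takes them in memory (low-to-high) order:
 *
 *   __m128i a = _mm_set_epi32(3, 2, 1, 0);  // element 0 = 0, ..., element 3 = 3
 *   __m128i b = _mm_setr_epi32(0, 1, 2, 3); // the same vector
 *   // _mm_cmpeq_epi32(a, b) yields all-ones in every element
 */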
3776
3777 /// Constructs a 128-bit integer vector, initialized in reverse order
3778 /// with the specified 16-bit integral values.
3779 ///
3780 /// \headerfile <x86intrin.h>
3781 ///
3782 /// This intrinsic is a utility function and does not correspond to a specific
3783 /// instruction.
3784 ///
3785 /// \param __w0
3786 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3787 /// \param __w1
3788 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3789 /// \param __w2
3790 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3791 /// \param __w3
3792 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3793 /// \param __w4
3794 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3795 /// \param __w5
3796 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3797 /// \param __w6
3798 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3799 /// \param __w7
3800 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3801 /// \returns An initialized 128-bit integer vector.
3802 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0,short __w1,short __w2,short __w3,short __w4,short __w5,short __w6,short __w7)3803 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3804 short __w5, short __w6, short __w7) {
3805 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3806 }
3807
3808 /// Constructs a 128-bit integer vector, initialized in reverse order
3809 /// with the specified 8-bit integral values.
3810 ///
3811 /// \headerfile <x86intrin.h>
3812 ///
3813 /// This intrinsic is a utility function and does not correspond to a specific
3814 /// instruction.
3815 ///
3816 /// \param __b0
3817 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3818 /// \param __b1
3819 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3820 /// \param __b2
3821 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3822 /// \param __b3
3823 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3824 /// \param __b4
3825 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3826 /// \param __b5
3827 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3828 /// \param __b6
3829 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3830 /// \param __b7
3831 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3832 /// \param __b8
3833 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3834 /// \param __b9
3835 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3836 /// \param __b10
3837 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3838 /// \param __b11
3839 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3840 /// \param __b12
3841 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3842 /// \param __b13
3843 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3844 /// \param __b14
3845 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3846 /// \param __b15
3847 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3848 /// \returns An initialized 128-bit integer vector.
3849 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0,char __b1,char __b2,char __b3,char __b4,char __b5,char __b6,char __b7,char __b8,char __b9,char __b10,char __b11,char __b12,char __b13,char __b14,char __b15)3850 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3851 char __b6, char __b7, char __b8, char __b9, char __b10,
3852 char __b11, char __b12, char __b13, char __b14, char __b15) {
3853 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3854 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3855 }
3856
3857 /// Creates a 128-bit integer vector initialized to zero.
3858 ///
3859 /// \headerfile <x86intrin.h>
3860 ///
3861 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3862 ///
3863 /// \returns An initialized 128-bit integer vector with all elements set to
3864 /// zero.
3865 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3866 return __extension__(__m128i)(__v2di){0LL, 0LL};
3867 }
3868
3869 /// Stores a 128-bit integer vector to a memory location aligned on a
3870 /// 128-bit boundary.
3871 ///
3872 /// \headerfile <x86intrin.h>
3873 ///
3874 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3875 ///
3876 /// \param __p
3877 /// A pointer to an aligned memory location that will receive the integer
3878 /// values.
3879 /// \param __b
3880 /// A 128-bit integer vector containing the values to be moved.
3881 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3882 __m128i __b) {
3883 *__p = __b;
3884 }
3885
3886 /// Stores a 128-bit integer vector to an unaligned memory location.
3887 ///
3888 /// \headerfile <x86intrin.h>
3889 ///
3890 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3891 ///
3892 /// \param __p
3893 /// A pointer to a memory location that will receive the integer values.
3894 /// \param __b
3895 /// A 128-bit integer vector containing the values to be moved.
3896 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3897 __m128i __b) {
3898 struct __storeu_si128 {
3899 __m128i_u __v;
3900 } __attribute__((__packed__, __may_alias__));
3901 ((struct __storeu_si128 *)__p)->__v = __b;
3902 }
3903
3904 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3905 /// vector.
3906 ///
3907 /// \headerfile <x86intrin.h>
3908 ///
3909 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3910 ///
3911 /// \param __p
3912 /// A pointer to a 64-bit memory location. The address of the memory
3913 /// location does not have to be aligned.
3914 /// \param __b
3915 /// A 128-bit integer vector containing the value to be stored.
3916 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3917 __m128i __b) {
3918 struct __storeu_si64 {
3919 long long __v;
3920 } __attribute__((__packed__, __may_alias__));
3921 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3922 }
3923
3924 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3925 /// vector.
3926 ///
3927 /// \headerfile <x86intrin.h>
3928 ///
3929 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3930 ///
3931 /// \param __p
3932 /// A pointer to a 32-bit memory location. The address of the memory
3933 /// location does not have to be aligned.
3934 /// \param __b
3935 /// A 128-bit integer vector containing the value to be stored.
3936 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3937 __m128i __b) {
3938 struct __storeu_si32 {
3939 int __v;
3940 } __attribute__((__packed__, __may_alias__));
3941 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3942 }
3943
3944 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3945 /// vector.
3946 ///
3947 /// \headerfile <x86intrin.h>
3948 ///
3949 /// This intrinsic does not correspond to a specific instruction.
3950 ///
3951 /// \param __p
3952 /// A pointer to a 16-bit memory location. The address of the memory
3953 /// location does not have to be aligned.
3954 /// \param __b
3955 /// A 128-bit integer vector containing the value to be stored.
3956 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3957 __m128i __b) {
3958 struct __storeu_si16 {
3959 short __v;
3960 } __attribute__((__packed__, __may_alias__));
3961 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3962 }
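/* Illustrative usage (editorial sketch): the unaligned scalar stores write only
 * the low element of the vector, and the destination pointer need not be
 * aligned. The buffer below is hypothetical.
 *
 *   unsigned char buf[16];
 *   __m128i v = _mm_set_epi32(4, 3, 2, 1);
 *   _mm_storeu_si128((__m128i_u *)buf, v);  // writes all 16 bytes
 *   _mm_storeu_si32(buf, v);                // writes only the low 32 bits (1)
 *   _mm_storeu_si16(buf, v);                // writes only the low 16 bits
 */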
3963
3964 /// Moves bytes selected by the mask from the first operand to the
3965 /// specified unaligned memory location. When a mask bit is 1, the
3966 /// corresponding byte is written, otherwise it is not written.
3967 ///
3968 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3969 /// used again soon). Exception and trap behavior for elements not selected
3970 /// for storage to memory are implementation dependent.
3971 ///
3972 /// \headerfile <x86intrin.h>
3973 ///
3974 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3975 /// instruction.
3976 ///
3977 /// \param __d
3978 /// A 128-bit integer vector containing the values to be moved.
3979 /// \param __n
3980 /// A 128-bit integer vector containing the mask. The most significant bit of
3981 /// each byte represents the mask bits.
3982 /// \param __p
3983 /// A pointer to an unaligned 128-bit memory location where the specified
3984 /// values are moved.
3985 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3986 __m128i __n,
3987 char *__p) {
3988 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3989 }
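/* Illustrative usage (editorial sketch): only bytes whose mask byte has its
 * most significant bit set are written; the destination is hypothetical and
 * need not be aligned.
 *
 *   char dst[16] = {0};
 *   __m128i data = _mm_set1_epi8(0x7F);
 *   __m128i mask = _mm_set_epi8((char)0x80, 0, 0, 0, 0, 0, 0, 0,
 *                               0, 0, 0, 0, 0, 0, 0, 0);
 *   _mm_maskmoveu_si128(data, mask, dst);   // writes only dst[15]
 */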
3990
3991 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3992 /// a memory location.
3993 ///
3994 /// \headerfile <x86intrin.h>
3995 ///
3996 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3997 ///
3998 /// \param __p
3999 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
4000 /// of the integer vector parameter.
4001 /// \param __a
4002 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4003 /// value to be stored.
4004 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
4005 __m128i __a) {
4006 struct __mm_storel_epi64_struct {
4007 long long __u;
4008 } __attribute__((__packed__, __may_alias__));
4009 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4010 }
4011
4012 /// Stores a 128-bit floating-point vector of [2 x double] to a 128-bit
4013 /// aligned memory location.
4014 ///
4015 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4016 /// used again soon).
4017 ///
4018 /// \headerfile <x86intrin.h>
4019 ///
4020 /// This intrinsic corresponds to the <c> VMOVNTPD / MOVNTPD </c> instruction.
4021 ///
4022 /// \param __p
4023 /// A pointer to the 128-bit aligned memory location used to store the value.
4024 /// \param __a
4025 /// A vector of [2 x double] containing the 64-bit values to be stored.
4026 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4027 __m128d __a) {
4028 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4029 }
4030
4031 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4032 ///
4033 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4034 /// used again soon).
4035 ///
4036 /// \headerfile <x86intrin.h>
4037 ///
4038 /// This intrinsic corresponds to the <c> VMOVNTDQ / MOVNTDQ </c> instruction.
4039 ///
4040 /// \param __p
4041 /// A pointer to the 128-bit aligned memory location used to store the value.
4042 /// \param __a
4043 /// A 128-bit integer vector containing the values to be stored.
4044 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4045 __m128i __a) {
4046 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4047 }
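/* Illustrative usage (editorial sketch): non-temporal stores bypass the cache
 * hierarchy; a store fence is typically issued before the data is consumed by
 * another agent. The aligned output buffer is hypothetical.
 *
 *   _Alignas(16) long long out[2];
 *   _mm_stream_si128(out, _mm_set1_epi64x(42));
 *   _mm_sfence();   // from <xmmintrin.h>, orders the streaming store
 */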
4048
4049 /// Stores a 32-bit integer value in the specified memory location.
4050 ///
4051 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4052 /// used again soon).
4053 ///
4054 /// \headerfile <x86intrin.h>
4055 ///
4056 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4057 ///
4058 /// \param __p
4059 /// A pointer to the 32-bit memory location used to store the value.
4060 /// \param __a
4061 /// A 32-bit integer containing the value to be stored.
4062 static __inline__ void
4063 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4064 _mm_stream_si32(void *__p, int __a) {
4065 __builtin_ia32_movnti((int *)__p, __a);
4066 }
4067
4068 #ifdef __x86_64__
4069 /// Stores a 64-bit integer value in the specified memory location.
4070 ///
4071 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4072 /// used again soon).
4073 ///
4074 /// \headerfile <x86intrin.h>
4075 ///
4076 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4077 ///
4078 /// \param __p
4079 /// A pointer to the 64-bit memory location used to store the value.
4080 /// \param __a
4081 /// A 64-bit integer containing the value to be stored.
4082 static __inline__ void
4083 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4084 _mm_stream_si64(void *__p, long long __a) {
4085 __builtin_ia32_movnti64((long long *)__p, __a);
4086 }
4087 #endif
4088
4089 #if defined(__cplusplus)
4090 extern "C" {
4091 #endif
4092
4093 /// The cache line containing \a __p is flushed and invalidated from all
4094 /// caches in the coherency domain.
4095 ///
4096 /// \headerfile <x86intrin.h>
4097 ///
4098 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4099 ///
4100 /// \param __p
4101 /// A pointer to the memory location used to identify the cache line to be
4102 /// flushed.
4103 void _mm_clflush(void const *__p);
4104
4105 /// Forces strong memory ordering (serialization) between load
4106 /// instructions preceding this instruction and load instructions following
4107 /// this instruction, ensuring the system completes all previous loads before
4108 /// executing subsequent loads.
4109 ///
4110 /// \headerfile <x86intrin.h>
4111 ///
4112 /// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4113 ///
4114 void _mm_lfence(void);
4115
4116 /// Forces strong memory ordering (serialization) between load and store
4117 /// instructions preceding this instruction and load and store instructions
4118 /// following this instruction, ensuring that the system completes all
4119 /// previous memory accesses before executing subsequent memory accesses.
4120 ///
4121 /// \headerfile <x86intrin.h>
4122 ///
4123 /// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4124 ///
4125 void _mm_mfence(void);
4126
4127 #if defined(__cplusplus)
4128 } // extern "C"
4129 #endif
4130
4131 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4132 /// vector operands into 8-bit signed integers, and packs the results into
4133 /// the destination.
4134 ///
4135 /// Positive values greater than 0x7F are saturated to 0x7F. Negative values
4136 /// less than -0x80 are saturated to -0x80.
4137 ///
4138 /// \headerfile <x86intrin.h>
4139 ///
4140 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4141 ///
4142 /// \param __a
4143 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4144 /// written to the lower 64 bits of the result.
4145 /// \param __b
4146 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4147 /// written to the higher 64 bits of the result.
4148 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4149 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4150 __m128i __b) {
4151 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4152 }
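/* Illustrative usage (editorial sketch): packing two [8 x i16] vectors into a
 * single [16 x i8] vector with signed saturation; the values are hypothetical.
 *
 *   __m128i a = _mm_set1_epi16(300);     // > 127, saturates to 127
 *   __m128i b = _mm_set1_epi16(-300);    // < -128, saturates to -128
 *   __m128i p = _mm_packs_epi16(a, b);   // low 8 bytes = 127, high 8 bytes = -128
 */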
4153
4154 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4155 /// vector operands into 16-bit signed integers, and packs the results into
4156 /// the destination.
4157 ///
4158 /// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4159 /// values less than -0x8000 are saturated to -0x8000.
4160 ///
4161 /// \headerfile <x86intrin.h>
4162 ///
4163 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4164 ///
4165 /// \param __a
4166 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4167 /// are written to the lower 64 bits of the result.
4168 /// \param __b
4169 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4170 /// are written to the higher 64 bits of the result.
4171 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4172 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4173 __m128i __b) {
4174 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4175 }
4176
4177 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4178 /// vector operands into 8-bit unsigned integers, and packs the results into
4179 /// the destination.
4180 ///
4181 /// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4182 /// are saturated to 0x00.
4183 ///
4184 /// \headerfile <x86intrin.h>
4185 ///
4186 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4187 ///
4188 /// \param __a
4189 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4190 /// written to the lower 64 bits of the result.
4191 /// \param __b
4192 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4193 /// written to the higher 64 bits of the result.
4194 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4195 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4196 __m128i __b) {
4197 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4198 }
4199
4200 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4201 /// the immediate-value parameter as a selector.
4202 ///
4203 /// \headerfile <x86intrin.h>
4204 ///
4205 /// \code
4206 /// int _mm_extract_epi16(__m128i a, const int imm);
4207 /// \endcode
4208 ///
4209 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4210 ///
4211 /// \param a
4212 /// A 128-bit integer vector.
4213 /// \param imm
4214 ///    An immediate value. Bits [2:0] select the value from \a a to be assigned
4215 ///    to bits [15:0] of the result. \n
4216 /// 000: assign values from bits [15:0] of \a a. \n
4217 /// 001: assign values from bits [31:16] of \a a. \n
4218 /// 010: assign values from bits [47:32] of \a a. \n
4219 /// 011: assign values from bits [63:48] of \a a. \n
4220 /// 100: assign values from bits [79:64] of \a a. \n
4221 /// 101: assign values from bits [95:80] of \a a. \n
4222 /// 110: assign values from bits [111:96] of \a a. \n
4223 /// 111: assign values from bits [127:112] of \a a.
4224 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4225 /// integer vector parameter and the remaining bits are assigned zeros.
4226 #define _mm_extract_epi16(a, imm) \
4227 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4228 (int)(imm)))
4229
4230 /// Constructs a 128-bit integer vector by first making a copy of the
4231 /// 128-bit integer vector parameter, and then inserting the lower 16 bits
4232 /// of an integer parameter into an offset specified by the immediate-value
4233 /// parameter.
4234 ///
4235 /// \headerfile <x86intrin.h>
4236 ///
4237 /// \code
4238 /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4239 /// \endcode
4240 ///
4241 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4242 ///
4243 /// \param a
4244 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4245 /// result and then one of the eight elements in the result is replaced by
4246 /// the lower 16 bits of \a b.
4247 /// \param b
4248 /// An integer. The lower 16 bits of this parameter are written to the
4249 /// result beginning at an offset specified by \a imm.
4250 /// \param imm
4251 ///    An immediate value specifying which of the eight 16-bit elements of the
4252 ///    result is replaced by the lower 16 bits of \a b.
4253 /// \returns A 128-bit integer vector containing the constructed values.
4254 #define _mm_insert_epi16(a, b, imm) \
4255 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4256 (int)(imm)))
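/* Illustrative usage (editorial sketch): reading and replacing a single 16-bit
 * lane; the selector must be a compile-time constant.
 *
 *   __m128i v  = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
 *   int w3     = _mm_extract_epi16(v, 3);         // 13, zero-extended to int
 *   __m128i v2 = _mm_insert_epi16(v, -1, 3);      // lane 3 becomes 0xFFFF
 */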
4257
4258 /// Copies the values of the most significant bits from each 8-bit
4259 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4260 /// value, zero-extends the value, and writes it to the destination.
4261 ///
4262 /// \headerfile <x86intrin.h>
4263 ///
4264 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4265 ///
4266 /// \param __a
4267 /// A 128-bit integer vector containing the values with bits to be extracted.
4268 /// \returns The most significant bits from each 8-bit element in \a __a,
4269 /// written to bits [15:0]. The other bits are assigned zeros.
4270 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4271 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4272 }
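/* Illustrative usage (editorial sketch): a common idiom compares two byte
 * vectors and tests whether every lane matched; x and y are hypothetical
 * [16 x i8] vectors.
 *
 *   __m128i eq = _mm_cmpeq_epi8(x, y);             // 0xFF where bytes match
 *   int all_equal = (_mm_movemask_epi8(eq) == 0xFFFF);
 */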
4273
4274 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4275 /// elements of a 128-bit integer vector parameter, using the immediate-value
4276 /// parameter as a specifier.
4277 ///
4278 /// \headerfile <x86intrin.h>
4279 ///
4280 /// \code
4281 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4282 /// \endcode
4283 ///
4284 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4285 ///
4286 /// \param a
4287 /// A 128-bit integer vector containing the values to be copied.
4288 /// \param imm
4289 /// An immediate value containing an 8-bit value specifying which elements to
4290 ///    copy from \a a. The elements of the 128-bit result are assigned
4291 /// values as follows: \n
4292 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4293 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4294 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4295 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4296 /// Bit value assignments: \n
4297 /// 00: assign values from bits [31:0] of \a a. \n
4298 /// 01: assign values from bits [63:32] of \a a. \n
4299 /// 10: assign values from bits [95:64] of \a a. \n
4300 /// 11: assign values from bits [127:96] of \a a. \n
4301 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4302 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4303 /// <c>[b6, b4, b2, b0]</c>.
4304 /// \returns A 128-bit integer vector containing the shuffled values.
4305 #define _mm_shuffle_epi32(a, imm) \
4306 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
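/* Illustrative usage (editorial sketch): broadcasting lane 0 and reversing the
 * lanes of a [4 x i32] vector with _MM_SHUFFLE (defined in <xmmintrin.h>).
 *
 *   __m128i v      = _mm_setr_epi32(0, 1, 2, 3);
 *   __m128i splat0 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)); // {0,0,0,0}
 *   __m128i rev    = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // {3,2,1,0}
 */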
4307
4308 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4309 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4310 /// value parameter as a specifier.
4311 ///
4312 /// \headerfile <x86intrin.h>
4313 ///
4314 /// \code
4315 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4316 /// \endcode
4317 ///
4318 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4319 ///
4320 /// \param a
4321 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4322 /// [127:64] of the result.
4323 /// \param imm
4324 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4325 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4326 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4327 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4328 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4329 /// Bit value assignments: \n
4330 /// 00: assign values from bits [15:0] of \a a. \n
4331 /// 01: assign values from bits [31:16] of \a a. \n
4332 /// 10: assign values from bits [47:32] of \a a. \n
4333 /// 11: assign values from bits [63:48] of \a a. \n
4334 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4335 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4336 /// <c>[b6, b4, b2, b0]</c>.
4337 /// \returns A 128-bit integer vector containing the shuffled values.
4338 #define _mm_shufflelo_epi16(a, imm) \
4339 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4340
4341 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4342 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4343 /// value parameter as a specifier.
4344 ///
4345 /// \headerfile <x86intrin.h>
4346 ///
4347 /// \code
4348 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4349 /// \endcode
4350 ///
4351 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4352 ///
4353 /// \param a
4354 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4355 /// [63:0] of the result.
4356 /// \param imm
4357 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4358 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4359 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4360 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4361 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4362 /// Bit value assignments: \n
4363 /// 00: assign values from bits [79:64] of \a a. \n
4364 /// 01: assign values from bits [95:80] of \a a. \n
4365 /// 10: assign values from bits [111:96] of \a a. \n
4366 /// 11: assign values from bits [127:112] of \a a. \n
4367 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4368 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4369 /// <c>[b6, b4, b2, b0]</c>.
4370 /// \returns A 128-bit integer vector containing the shuffled values.
4371 #define _mm_shufflehi_epi16(a, imm) \
4372 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4373
4374 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4375 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4376 ///
4377 /// \headerfile <x86intrin.h>
4378 ///
4379 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4380 /// instruction.
4381 ///
4382 /// \param __a
4383 /// A 128-bit vector of [16 x i8].
4384 /// Bits [71:64] are written to bits [7:0] of the result. \n
4385 /// Bits [79:72] are written to bits [23:16] of the result. \n
4386 /// Bits [87:80] are written to bits [39:32] of the result. \n
4387 /// Bits [95:88] are written to bits [55:48] of the result. \n
4388 /// Bits [103:96] are written to bits [71:64] of the result. \n
4389 /// Bits [111:104] are written to bits [87:80] of the result. \n
4390 /// Bits [119:112] are written to bits [103:96] of the result. \n
4391 /// Bits [127:120] are written to bits [119:112] of the result.
4392 /// \param __b
4393 /// A 128-bit vector of [16 x i8]. \n
4394 /// Bits [71:64] are written to bits [15:8] of the result. \n
4395 /// Bits [79:72] are written to bits [31:24] of the result. \n
4396 /// Bits [87:80] are written to bits [47:40] of the result. \n
4397 /// Bits [95:88] are written to bits [63:56] of the result. \n
4398 /// Bits [103:96] are written to bits [79:72] of the result. \n
4399 /// Bits [111:104] are written to bits [95:88] of the result. \n
4400 /// Bits [119:112] are written to bits [111:104] of the result. \n
4401 /// Bits [127:120] are written to bits [127:120] of the result.
4402 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4403 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4404 __m128i __b) {
4405 return (__m128i)__builtin_shufflevector(
4406 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4407 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4408 }
4409
4410 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4411 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4412 ///
4413 /// \headerfile <x86intrin.h>
4414 ///
4415 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4416 /// instruction.
4417 ///
4418 /// \param __a
4419 /// A 128-bit vector of [8 x i16].
4420 /// Bits [79:64] are written to bits [15:0] of the result. \n
4421 /// Bits [95:80] are written to bits [47:32] of the result. \n
4422 /// Bits [111:96] are written to bits [79:64] of the result. \n
4423 /// Bits [127:112] are written to bits [111:96] of the result.
4424 /// \param __b
4425 /// A 128-bit vector of [8 x i16].
4426 /// Bits [79:64] are written to bits [31:16] of the result. \n
4427 /// Bits [95:80] are written to bits [63:48] of the result. \n
4428 /// Bits [111:96] are written to bits [95:80] of the result. \n
4429 /// Bits [127:112] are written to bits [127:112] of the result.
4430 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4431 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4432 __m128i __b) {
4433 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4434 8 + 5, 6, 8 + 6, 7, 8 + 7);
4435 }
4436
4437 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4438 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4439 ///
4440 /// \headerfile <x86intrin.h>
4441 ///
4442 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4443 /// instruction.
4444 ///
4445 /// \param __a
4446 /// A 128-bit vector of [4 x i32]. \n
4447 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4448 /// Bits [127:96] are written to bits [95:64] of the destination.
4449 /// \param __b
4450 /// A 128-bit vector of [4 x i32]. \n
4451 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
4452 /// Bits [127:96] are written to bits [127:96] of the destination.
4453 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4454 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4455 __m128i __b) {
4456 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4457 4 + 3);
4458 }
4459
4460 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4461 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4462 ///
4463 /// \headerfile <x86intrin.h>
4464 ///
4465 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4466 /// instruction.
4467 ///
4468 /// \param __a
4469 /// A 128-bit vector of [2 x i64]. \n
4470 /// Bits [127:64] are written to bits [63:0] of the destination.
4471 /// \param __b
4472 /// A 128-bit vector of [2 x i64]. \n
4473 /// Bits [127:64] are written to bits [127:64] of the destination.
4474 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4475 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4476 __m128i __b) {
4477 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4478 }
4479
4480 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4481 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4482 ///
4483 /// \headerfile <x86intrin.h>
4484 ///
4485 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4486 /// instruction.
4487 ///
4488 /// \param __a
4489 /// A 128-bit vector of [16 x i8]. \n
4490 /// Bits [7:0] are written to bits [7:0] of the result. \n
4491 /// Bits [15:8] are written to bits [23:16] of the result. \n
4492 /// Bits [23:16] are written to bits [39:32] of the result. \n
4493 /// Bits [31:24] are written to bits [55:48] of the result. \n
4494 /// Bits [39:32] are written to bits [71:64] of the result. \n
4495 /// Bits [47:40] are written to bits [87:80] of the result. \n
4496 /// Bits [55:48] are written to bits [103:96] of the result. \n
4497 /// Bits [63:56] are written to bits [119:112] of the result.
4498 /// \param __b
4499 /// A 128-bit vector of [16 x i8].
4500 /// Bits [7:0] are written to bits [15:8] of the result. \n
4501 /// Bits [15:8] are written to bits [31:24] of the result. \n
4502 /// Bits [23:16] are written to bits [47:40] of the result. \n
4503 /// Bits [31:24] are written to bits [63:56] of the result. \n
4504 /// Bits [39:32] are written to bits [79:72] of the result. \n
4505 /// Bits [47:40] are written to bits [95:88] of the result. \n
4506 /// Bits [55:48] are written to bits [111:104] of the result. \n
4507 /// Bits [63:56] are written to bits [127:120] of the result.
4508 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4509 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4510 __m128i __b) {
4511 return (__m128i)__builtin_shufflevector(
4512 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4513 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4514 }
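/* Illustrative usage (editorial sketch): interleaving with a zero vector is the
 * classic SSE2 way to widen unsigned bytes into 16-bit lanes; `bytes` is a
 * hypothetical [16 x u8] vector.
 *
 *   __m128i zero   = _mm_setzero_si128();
 *   __m128i lo_u16 = _mm_unpacklo_epi8(bytes, zero);  // bytes 0-7  -> [8 x u16]
 *   __m128i hi_u16 = _mm_unpackhi_epi8(bytes, zero);  // bytes 8-15 -> [8 x u16]
 */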
4515
4516 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4517 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4518 /// [8 x i16].
4519 ///
4520 /// \headerfile <x86intrin.h>
4521 ///
4522 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4523 /// instruction.
4524 ///
4525 /// \param __a
4526 /// A 128-bit vector of [8 x i16].
4527 /// Bits [15:0] are written to bits [15:0] of the result. \n
4528 /// Bits [31:16] are written to bits [47:32] of the result. \n
4529 /// Bits [47:32] are written to bits [79:64] of the result. \n
4530 /// Bits [63:48] are written to bits [111:96] of the result.
4531 /// \param __b
4532 /// A 128-bit vector of [8 x i16].
4533 /// Bits [15:0] are written to bits [31:16] of the result. \n
4534 /// Bits [31:16] are written to bits [63:48] of the result. \n
4535 /// Bits [47:32] are written to bits [95:80] of the result. \n
4536 /// Bits [63:48] are written to bits [127:112] of the result.
4537 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4538 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4539 __m128i __b) {
4540 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4541 8 + 1, 2, 8 + 2, 3, 8 + 3);
4542 }
4543
4544 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4545 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4546 ///
4547 /// \headerfile <x86intrin.h>
4548 ///
4549 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4550 /// instruction.
4551 ///
4552 /// \param __a
4553 /// A 128-bit vector of [4 x i32]. \n
4554 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4555 /// Bits [63:32] are written to bits [95:64] of the destination.
4556 /// \param __b
4557 /// A 128-bit vector of [4 x i32]. \n
4558 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
4559 /// Bits [63:32] are written to bits [127:96] of the destination.
4560 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4561 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4562 __m128i __b) {
4563 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4564 4 + 1);
4565 }
4566
4567 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4568 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4569 ///
4570 /// \headerfile <x86intrin.h>
4571 ///
4572 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4573 /// instruction.
4574 ///
4575 /// \param __a
4576 /// A 128-bit vector of [2 x i64]. \n
4577 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4578 /// \param __b
4579 /// A 128-bit vector of [2 x i64]. \n
4580 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4581 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4582 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4583 __m128i __b) {
4584 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4585 }
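/* Illustrative usage (editorial sketch): pairing the corresponding halves of
 * two [2 x i64] vectors; a and b are hypothetical.
 *
 *   __m128i lo_pair = _mm_unpacklo_epi64(a, b);  // { a[0], b[0] }
 *   __m128i hi_pair = _mm_unpackhi_epi64(a, b);  // { a[1], b[1] }
 */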
4586
4587 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4588 /// integer.
4589 ///
4590 /// \headerfile <x86intrin.h>
4591 ///
4592 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4593 ///
4594 /// \param __a
4595 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4596 /// destination.
4597 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4598 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4599 return (__m64)__a[0];
4600 }
4601
4602 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4603 /// upper bits.
4604 ///
4605 /// \headerfile <x86intrin.h>
4606 ///
4607 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4608 ///
4609 /// \param __a
4610 /// A 64-bit value.
4611 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4612 /// the operand. The upper 64 bits are assigned zeros.
4613 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4614 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4615 }
4616
4617 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4618 /// integer vector, zeroing the upper bits.
4619 ///
4620 /// \headerfile <x86intrin.h>
4621 ///
4622 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4623 ///
4624 /// \param __a
4625 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4626 /// destination.
4627 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4628 /// the operand. The upper 64 bits are assigned zeros.
4629 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4630 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4631 }
4632
4633 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4634 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4635 /// double].
4636 ///
4637 /// \headerfile <x86intrin.h>
4638 ///
4639 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4640 ///
4641 /// \param __a
4642 /// A 128-bit vector of [2 x double]. \n
4643 /// Bits [127:64] are written to bits [63:0] of the destination.
4644 /// \param __b
4645 /// A 128-bit vector of [2 x double]. \n
4646 /// Bits [127:64] are written to bits [127:64] of the destination.
4647 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4648 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4649 __m128d __b) {
4650 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4651 }
4652
4653 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4654 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4655 /// double].
4656 ///
4657 /// \headerfile <x86intrin.h>
4658 ///
4659 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4660 ///
4661 /// \param __a
4662 /// A 128-bit vector of [2 x double]. \n
4663 /// Bits [63:0] are written to bits [63:0] of the destination.
4664 /// \param __b
4665 /// A 128-bit vector of [2 x double]. \n
4666 /// Bits [63:0] are written to bits [127:64] of the destination.
4667 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4668 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4669 __m128d __b) {
4670 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4671 }
4672
4673 /// Extracts the sign bits of the double-precision values in the 128-bit
4674 /// vector of [2 x double], zero-extends the value, and writes it to the
4675 /// low-order bits of the destination.
4676 ///
4677 /// \headerfile <x86intrin.h>
4678 ///
4679 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4680 ///
4681 /// \param __a
4682 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4683 /// be extracted.
4684 /// \returns The sign bits from each of the double-precision elements in \a __a,
4685 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4686 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4687 return __builtin_ia32_movmskpd((__v2df)__a);
4688 }
4689
4690 /// Constructs a 128-bit floating-point vector of [2 x double] from two
4691 /// 128-bit vector parameters of [2 x double], using the immediate-value
4692 /// parameter as a specifier.
4693 ///
4694 /// \headerfile <x86intrin.h>
4695 ///
4696 /// \code
4697 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4698 /// \endcode
4699 ///
4700 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4701 ///
4702 /// \param a
4703 /// A 128-bit vector of [2 x double].
4704 /// \param b
4705 /// A 128-bit vector of [2 x double].
4706 /// \param i
4707 /// An 8-bit immediate value. The least significant two bits specify which
4708 /// elements to copy from \a a and \a b: \n
4709 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4710 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4711 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4712 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4713 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4714 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4715 /// <c>[b1, b0]</c>.
4716 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
4717 #define _mm_shuffle_pd(a, b, i) \
4718 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4719 (int)(i)))
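/* Illustrative usage (editorial sketch): _MM_SHUFFLE2 (defined near the end of
 * this header) builds the two-bit selector; a and b are hypothetical
 * [2 x double] vectors.
 *
 *   __m128d swapped = _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1));  // { a[1], a[0] }
 *   __m128d mixed   = _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0));  // { a[0], b[1] }
 */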
4720
4721 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4722 /// floating-point vector of [4 x float].
4723 ///
4724 /// \headerfile <x86intrin.h>
4725 ///
4726 /// This intrinsic has no corresponding instruction.
4727 ///
4728 /// \param __a
4729 /// A 128-bit floating-point vector of [2 x double].
4730 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4731 /// bitwise pattern as the parameter.
4732 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4733 return (__m128)__a;
4734 }
4735
4736 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4737 /// integer vector.
4738 ///
4739 /// \headerfile <x86intrin.h>
4740 ///
4741 /// This intrinsic has no corresponding instruction.
4742 ///
4743 /// \param __a
4744 /// A 128-bit floating-point vector of [2 x double].
4745 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4746 /// parameter.
4747 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4748 return (__m128i)__a;
4749 }
4750
4751 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4752 /// floating-point vector of [2 x double].
4753 ///
4754 /// \headerfile <x86intrin.h>
4755 ///
4756 /// This intrinsic has no corresponding instruction.
4757 ///
4758 /// \param __a
4759 /// A 128-bit floating-point vector of [4 x float].
4760 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4761 /// bitwise pattern as the parameter.
4762 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4763 return (__m128d)__a;
4764 }
4765
4766 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4767 /// integer vector.
4768 ///
4769 /// \headerfile <x86intrin.h>
4770 ///
4771 /// This intrinsic has no corresponding instruction.
4772 ///
4773 /// \param __a
4774 /// A 128-bit floating-point vector of [4 x float].
4775 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4776 /// parameter.
4777 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4778 return (__m128i)__a;
4779 }
4780
4781 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4782 /// of [4 x float].
4783 ///
4784 /// \headerfile <x86intrin.h>
4785 ///
4786 /// This intrinsic has no corresponding instruction.
4787 ///
4788 /// \param __a
4789 /// A 128-bit integer vector.
4790 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4791 /// bitwise pattern as the parameter.
4792 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4793 return (__m128)__a;
4794 }
4795
4796 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4797 /// of [2 x double].
4798 ///
4799 /// \headerfile <x86intrin.h>
4800 ///
4801 /// This intrinsic has no corresponding instruction.
4802 ///
4803 /// \param __a
4804 /// A 128-bit integer vector.
4805 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4806 /// bitwise pattern as the parameter.
4807 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4808 return (__m128d)__a;
4809 }
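/* Illustrative usage (editorial sketch): the casts are free at run time, which
 * makes integer bit tricks on floating-point data convenient, e.g. clearing the
 * sign bits of a [2 x double] vector (an absolute-value idiom); d is a
 * hypothetical __m128d.
 *
 *   __m128i abs_mask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
 *   __m128d abs_d    = _mm_and_pd(d, _mm_castsi128_pd(abs_mask));
 */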
4810
4811 /// Compares each of the corresponding double-precision values of two
4812 /// 128-bit vectors of [2 x double], using the operation specified by the
4813 /// immediate integer operand.
4814 ///
4815 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4816 /// If either value in a comparison is NaN, comparisons that are ordered
4817 /// return false, and comparisons that are unordered return true.
4818 ///
4819 /// \headerfile <x86intrin.h>
4820 ///
4821 /// \code
4822 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
4823 /// \endcode
4824 ///
4825 /// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
4826 ///
4827 /// \param a
4828 /// A 128-bit vector of [2 x double].
4829 /// \param b
4830 /// A 128-bit vector of [2 x double].
4831 /// \param c
4832 /// An immediate integer operand, with bits [4:0] specifying which comparison
4833 /// operation to use: \n
4834 /// 0x00: Equal (ordered, non-signaling) \n
4835 /// 0x01: Less-than (ordered, signaling) \n
4836 /// 0x02: Less-than-or-equal (ordered, signaling) \n
4837 /// 0x03: Unordered (non-signaling) \n
4838 /// 0x04: Not-equal (unordered, non-signaling) \n
4839 /// 0x05: Not-less-than (unordered, signaling) \n
4840 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4841 /// 0x07: Ordered (non-signaling) \n
4842 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
4843 #define _mm_cmp_pd(a, b, c) \
4844 ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4845 (c)))
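/* Illustrative usage (editorial sketch): predicate 0x01 selects the ordered,
 * signaling less-than comparison; a and b are hypothetical [2 x double]
 * vectors.
 *
 *   __m128d lt = _mm_cmp_pd(a, b, 0x01);   // all-ones where a < b
 *   int lanes  = _mm_movemask_pd(lt);      // bit i set if lane i compared true
 */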
4846
4847 /// Compares each of the corresponding scalar double-precision values of
4848 /// two 128-bit vectors of [2 x double], using the operation specified by the
4849 /// immediate integer operand.
4850 ///
4851 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4852 /// If either value in a comparison is NaN, comparisons that are ordered
4853 /// return false, and comparisons that are unordered return true.
4854 ///
4855 /// \headerfile <x86intrin.h>
4856 ///
4857 /// \code
4858 /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
4859 /// \endcode
4860 ///
4861 /// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
4862 ///
4863 /// \param a
4864 /// A 128-bit vector of [2 x double].
4865 /// \param b
4866 /// A 128-bit vector of [2 x double].
4867 /// \param c
4868 /// An immediate integer operand, with bits [4:0] specifying which comparison
4869 /// operation to use: \n
4870 /// 0x00: Equal (ordered, non-signaling) \n
4871 /// 0x01: Less-than (ordered, signaling) \n
4872 /// 0x02: Less-than-or-equal (ordered, signaling) \n
4873 /// 0x03: Unordered (non-signaling) \n
4874 /// 0x04: Not-equal (unordered, non-signaling) \n
4875 /// 0x05: Not-less-than (unordered, signaling) \n
4876 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4877 /// 0x07: Ordered (non-signaling) \n
4878 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
4879 #define _mm_cmp_sd(a, b, c) \
4880 ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4881 (c)))
4882
4883 #if defined(__cplusplus)
4884 extern "C" {
4885 #endif
4886
4887 /// Indicates that a spin loop is being executed for the purposes of
4888 /// optimizing power consumption during the loop.
4889 ///
4890 /// \headerfile <x86intrin.h>
4891 ///
4892 /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4893 ///
4894 void _mm_pause(void);
4895
4896 #if defined(__cplusplus)
4897 } // extern "C"
4898 #endif
4899
4900 #undef __anyext128
4901 #undef __trunc64
4902 #undef __DEFAULT_FN_ATTRS
4903
4904 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4905
4906 #define _MM_DENORMALS_ZERO_ON (0x0040U)
4907 #define _MM_DENORMALS_ZERO_OFF (0x0000U)
4908
4909 #define _MM_DENORMALS_ZERO_MASK (0x0040U)
4910
4911 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
4912 #define _MM_SET_DENORMALS_ZERO_MODE(x) \
4913 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
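/* Illustrative usage (editorial sketch): enabling denormals-are-zero mode for a
 * flush-heavy computation and restoring the previous setting afterwards.
 *
 *   unsigned int old_daz = _MM_GET_DENORMALS_ZERO_MODE();
 *   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
 *   ...                                   // computation sensitive to denormals
 *   _MM_SET_DENORMALS_ZERO_MODE(old_daz);
 */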
4914
4915 #endif /* __EMMINTRIN_H */
4916