• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <mmintrin.h>
18 
/* Generic 128-bit vector types used internally by the intrinsics. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
/* Public [4 x float] vector type; 16-byte aligned like the XMM registers. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Unaligned variant of __m128 (1-byte alignment) for unaligned load/store
   intrinsics. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
27 
28 /* This header should only be included in a hosted environment as it depends on
29  * a standard library to provide allocation routines. */
30 #if __STDC_HOSTED__
31 #include <mm_malloc.h>
32 #endif
33 
34 /* Define the default attributes for the functions in this file. */
35 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
36 #define __DEFAULT_FN_ATTRS                                                     \
37   __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
38                  __min_vector_width__(128)))
39 #define __DEFAULT_FN_ATTRS_SSE2                                                \
40   __attribute__((__always_inline__, __nodebug__,                               \
41                  __target__("sse2,no-evex512"), __min_vector_width__(128)))
42 #else
43 #define __DEFAULT_FN_ATTRS                                                     \
44   __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
45                  __min_vector_width__(128)))
46 #define __DEFAULT_FN_ATTRS_SSE2                                                \
47   __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
48                  __min_vector_width__(128)))
49 #endif
50 
/* Truncate a 128-bit vector to its low 64 bits, yielding an __m64. */
#define __trunc64(x)                                                           \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
/* Zero-extend a 64-bit vector to 128 bits: lanes 2-3 are taken from a
   zero-initialized vector. */
#define __zext128(x)                                                           \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, 2, 3)
/* Extend a 64-bit vector to 128 bits leaving the upper lanes undefined
   (shuffle index -1 means "don't care"). */
#define __anyext128(x)                                                         \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1)
/* Clear the upper 64 bits of a 128-bit vector: keep lanes 0-1, take lanes
   2-3 from a zero vector. */
#define __zeroupper64(x)                                                       \
  (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
                                    1, 4, 5)
62 
63 /// Adds the 32-bit float values in the low-order bits of the operands.
64 ///
65 /// \headerfile <x86intrin.h>
66 ///
67 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
68 ///
69 /// \param __a
70 ///    A 128-bit vector of [4 x float] containing one of the source operands.
71 ///    The lower 32 bits of this operand are used in the calculation.
72 /// \param __b
73 ///    A 128-bit vector of [4 x float] containing one of the source operands.
74 ///    The lower 32 bits of this operand are used in the calculation.
75 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
76 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
77 ///    the upper 96 bits of the first source operand.
78 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ss(__m128 __a,__m128 __b)79 _mm_add_ss(__m128 __a, __m128 __b)
80 {
81   __a[0] += __b[0];
82   return __a;
83 }
84 
85 /// Adds two 128-bit vectors of [4 x float], and returns the results of
86 ///    the addition.
87 ///
88 /// \headerfile <x86intrin.h>
89 ///
90 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
91 ///
92 /// \param __a
93 ///    A 128-bit vector of [4 x float] containing one of the source operands.
94 /// \param __b
95 ///    A 128-bit vector of [4 x float] containing one of the source operands.
96 /// \returns A 128-bit vector of [4 x float] containing the sums of both
97 ///    operands.
98 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ps(__m128 __a,__m128 __b)99 _mm_add_ps(__m128 __a, __m128 __b)
100 {
101   return (__m128)((__v4sf)__a + (__v4sf)__b);
102 }
103 
104 /// Subtracts the 32-bit float value in the low-order bits of the second
105 ///    operand from the corresponding value in the first operand.
106 ///
107 /// \headerfile <x86intrin.h>
108 ///
109 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
110 ///
111 /// \param __a
112 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
113 ///    of this operand are used in the calculation.
114 /// \param __b
115 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
116 ///    bits of this operand are used in the calculation.
117 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
118 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
119 ///    copied from the upper 96 bits of the first source operand.
120 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ss(__m128 __a,__m128 __b)121 _mm_sub_ss(__m128 __a, __m128 __b)
122 {
123   __a[0] -= __b[0];
124   return __a;
125 }
126 
127 /// Subtracts each of the values of the second operand from the first
128 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
129 ///    the results of the subtraction.
130 ///
131 /// \headerfile <x86intrin.h>
132 ///
133 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
134 ///
135 /// \param __a
136 ///    A 128-bit vector of [4 x float] containing the minuend.
137 /// \param __b
138 ///    A 128-bit vector of [4 x float] containing the subtrahend.
139 /// \returns A 128-bit vector of [4 x float] containing the differences between
140 ///    both operands.
141 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ps(__m128 __a,__m128 __b)142 _mm_sub_ps(__m128 __a, __m128 __b)
143 {
144   return (__m128)((__v4sf)__a - (__v4sf)__b);
145 }
146 
147 /// Multiplies two 32-bit float values in the low-order bits of the
148 ///    operands.
149 ///
150 /// \headerfile <x86intrin.h>
151 ///
152 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
153 ///
154 /// \param __a
155 ///    A 128-bit vector of [4 x float] containing one of the source operands.
156 ///    The lower 32 bits of this operand are used in the calculation.
157 /// \param __b
158 ///    A 128-bit vector of [4 x float] containing one of the source operands.
159 ///    The lower 32 bits of this operand are used in the calculation.
160 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
161 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
162 ///    bits of the first source operand.
163 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ss(__m128 __a,__m128 __b)164 _mm_mul_ss(__m128 __a, __m128 __b)
165 {
166   __a[0] *= __b[0];
167   return __a;
168 }
169 
170 /// Multiplies two 128-bit vectors of [4 x float] and returns the
171 ///    results of the multiplication.
172 ///
173 /// \headerfile <x86intrin.h>
174 ///
175 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
176 ///
177 /// \param __a
178 ///    A 128-bit vector of [4 x float] containing one of the source operands.
179 /// \param __b
180 ///    A 128-bit vector of [4 x float] containing one of the source operands.
181 /// \returns A 128-bit vector of [4 x float] containing the products of both
182 ///    operands.
183 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ps(__m128 __a,__m128 __b)184 _mm_mul_ps(__m128 __a, __m128 __b)
185 {
186   return (__m128)((__v4sf)__a * (__v4sf)__b);
187 }
188 
189 /// Divides the value in the low-order 32 bits of the first operand by
190 ///    the corresponding value in the second operand.
191 ///
192 /// \headerfile <x86intrin.h>
193 ///
194 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
195 ///
196 /// \param __a
197 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
198 ///    bits of this operand are used in the calculation.
199 /// \param __b
200 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
201 ///    of this operand are used in the calculation.
202 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
203 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
204 ///    upper 96 bits of the first source operand.
205 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ss(__m128 __a,__m128 __b)206 _mm_div_ss(__m128 __a, __m128 __b)
207 {
208   __a[0] /= __b[0];
209   return __a;
210 }
211 
212 /// Divides two 128-bit vectors of [4 x float].
213 ///
214 /// \headerfile <x86intrin.h>
215 ///
216 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
217 ///
218 /// \param __a
219 ///    A 128-bit vector of [4 x float] containing the dividend.
220 /// \param __b
221 ///    A 128-bit vector of [4 x float] containing the divisor.
222 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
223 ///    operands.
224 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ps(__m128 __a,__m128 __b)225 _mm_div_ps(__m128 __a, __m128 __b)
226 {
227   return (__m128)((__v4sf)__a / (__v4sf)__b);
228 }
229 
/// Calculates the square root of the value stored in the low-order bits
///    of a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the square root of the
///    value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ss(__m128 __a)
{
  /* The builtin takes the whole vector; the upper three elements pass
     through unchanged. */
  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
}
247 
/// Calculates the square roots of the values stored in a 128-bit vector
///    of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the square roots of the
///    values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ps(__m128 __a)
{
  return __builtin_ia32_sqrtps((__v4sf)__a);
}
264 
/// Calculates the approximate reciprocal of the value stored in the
///    low-order bits of a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the approximate
///    reciprocal of the value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ss(__m128 __a)
{
  /* Hardware approximation; not a full-precision 1.0f / x. */
  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
}
282 
/// Calculates the approximate reciprocals of the values stored in a
///    128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the approximate
///    reciprocals of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ps(__m128 __a)
{
  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
}
299 
/// Calculates the approximate reciprocal of the square root of the value
///    stored in the low-order bits of a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the approximate
///    reciprocal of the square root of the value in the low-order bits of the
///    operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ss(__m128 __a)
{
  /* Hardware approximation; not a full-precision 1.0f / sqrtf(x). */
  return __builtin_ia32_rsqrtss((__v4sf)__a);
}
318 
/// Calculates the approximate reciprocals of the square roots of the
///    values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the approximate
///    reciprocals of the square roots of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ps(__m128 __a)
{
  return __builtin_ia32_rsqrtps((__v4sf)__a);
}
335 
/// Compares two 32-bit float values in the low-order bits of both
///    operands and returns the lesser value in the low-order bits of the
///    vector of [4 x float].
///
///    If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    minimum value between both operands. The upper 96 bits are copied from
///    the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ss(__m128 __a, __m128 __b)
{
  /* MINSS is not commutative for NaN/signed-zero inputs, so operand order
     must be preserved. */
  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}
360 
/// Compares two 128-bit vectors of [4 x float] and returns the lesser
///    of each pair of values.
///
///    If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands.
/// \returns A 128-bit vector of [4 x float] containing the minimum values
///    between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ps(__m128 __a, __m128 __b)
{
  /* MINPS is not commutative for NaN/signed-zero inputs, so operand order
     must be preserved. */
  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
}
381 
/// Compares two 32-bit float values in the low-order bits of both
///    operands and returns the greater value in the low-order bits of a 128-bit
///    vector of [4 x float].
///
///    If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    maximum value between both operands. The upper 96 bits are copied from
///    the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ss(__m128 __a, __m128 __b)
{
  /* MAXSS is not commutative for NaN/signed-zero inputs, so operand order
     must be preserved. */
  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
}
406 
/// Compares two 128-bit vectors of [4 x float] and returns the greater
///    of each pair of values.
///
///    If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands.
/// \returns A 128-bit vector of [4 x float] containing the maximum values
///    between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ps(__m128 __a, __m128 __b)
{
  /* MAXPS is not commutative for NaN/signed-zero inputs, so operand order
     must be preserved. */
  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
}
427 
428 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
429 ///
430 /// \headerfile <x86intrin.h>
431 ///
432 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
433 ///
434 /// \param __a
435 ///    A 128-bit vector containing one of the source operands.
436 /// \param __b
437 ///    A 128-bit vector containing one of the source operands.
438 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
439 ///    values between both operands.
440 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_and_ps(__m128 __a,__m128 __b)441 _mm_and_ps(__m128 __a, __m128 __b)
442 {
443   return (__m128)((__v4su)__a & (__v4su)__b);
444 }
445 
446 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
447 ///    the one's complement of the values contained in the first source
448 ///    operand.
449 ///
450 /// \headerfile <x86intrin.h>
451 ///
452 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
453 ///
454 /// \param __a
455 ///    A 128-bit vector of [4 x float] containing the first source operand. The
456 ///    one's complement of this value is used in the bitwise AND.
457 /// \param __b
458 ///    A 128-bit vector of [4 x float] containing the second source operand.
459 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
460 ///    one's complement of the first operand and the values in the second
461 ///    operand.
462 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_andnot_ps(__m128 __a,__m128 __b)463 _mm_andnot_ps(__m128 __a, __m128 __b)
464 {
465   return (__m128)(~(__v4su)__a & (__v4su)__b);
466 }
467 
468 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
469 ///
470 /// \headerfile <x86intrin.h>
471 ///
472 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
473 ///
474 /// \param __a
475 ///    A 128-bit vector of [4 x float] containing one of the source operands.
476 /// \param __b
477 ///    A 128-bit vector of [4 x float] containing one of the source operands.
478 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
479 ///    values between both operands.
480 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_or_ps(__m128 __a,__m128 __b)481 _mm_or_ps(__m128 __a, __m128 __b)
482 {
483   return (__m128)((__v4su)__a | (__v4su)__b);
484 }
485 
486 /// Performs a bitwise exclusive OR of two 128-bit vectors of
487 ///    [4 x float].
488 ///
489 /// \headerfile <x86intrin.h>
490 ///
491 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
492 ///
493 /// \param __a
494 ///    A 128-bit vector of [4 x float] containing one of the source operands.
495 /// \param __b
496 ///    A 128-bit vector of [4 x float] containing one of the source operands.
497 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
498 ///    of the values between both operands.
499 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_xor_ps(__m128 __a,__m128 __b)500 _mm_xor_ps(__m128 __a, __m128 __b)
501 {
502   return (__m128)((__v4su)__a ^ (__v4su)__b);
503 }
504 
/// Compares two 32-bit float values in the low-order bits of both
///    operands for equality.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector [4 x float].
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
}
529 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] for equality.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
}
550 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is less than the
///    corresponding value in the second operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
}
576 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are less than those in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
}
598 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is less than or
///    equal to the corresponding value in the second operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in
///    the low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
}
624 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are less than or equal to those in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
}
646 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is greater than
///    the corresponding value in the second operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  /* There is no greater-than scalar compare instruction; compute
     __b < __a with CMPLTSS instead, then merge: shuffle index 4 selects
     element 0 of the compare result, while indices 1-3 keep the upper
     elements of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
674 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are greater than those in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  /* a > b is implemented as b < a with the operands swapped. */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
}
696 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is greater than
///    or equal to the corresponding value in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  /* There is no greater-or-equal scalar compare instruction; compute
     __b <= __a with CMPLESS instead, then merge: shuffle index 4 selects
     element 0 of the compare result, while indices 1-3 keep the upper
     elements of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
724 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are greater than or equal to those in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  /* a >= b is implemented as b <= a with the operands swapped. */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
}
746 
/// Compares two 32-bit float values in the low-order bits of both operands
///    for inequality.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
}
772 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] for inequality.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
}
794 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is not less than
///    the corresponding value in the second operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
}
821 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are not less than those in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
}
844 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is not less than
///    or equal to the corresponding value in the second operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
}
871 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are not less than or equal to those in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
}
894 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is not greater
///    than the corresponding value in the second operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ss(__m128 __a, __m128 __b)
{
  /* !(a > b)  <=>  !(b < a): emit CMPNLTSS with swapped operands, then
     merge its low element back into __a via the shuffle. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
923 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are not greater than those in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ps(__m128 __a, __m128 __b)
{
  /* !(a > b)  <=>  !(b < a): emit CMPNLTPS with swapped operands. */
  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
}
946 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is not greater
///    than or equal to the corresponding value in the second operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    low-order bits of a vector of [4 x float].
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ss(__m128 __a, __m128 __b)
{
  /* !(a >= b)  <=>  !(b <= a): emit CMPNLESS with swapped operands, then
     merge its low element back into __a via the shuffle. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
975 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are not greater than or equal to those in the second operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ps(__m128 __a, __m128 __b)
{
  /* !(a >= b)  <=>  !(b <= a): emit CMPNLEPS with swapped operands. */
  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
}
998 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is ordered with
///    respect to the corresponding value in the second operand.
///
///    A pair of floating-point values are ordered with respect to each
///    other if neither value is a NaN. The comparison returns 0x0 for false,
///    0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
}
1025 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are ordered with respect to those in the second operand.
///
///    A pair of floating-point values are ordered with respect to each
///    other if neither value is a NaN. Each comparison returns 0x0 for false,
///    0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
}
1049 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is unordered
///    with respect to the corresponding value in the second operand.
///
///    A pair of floating-point values are unordered with respect to each
///    other if one or both values are NaN. The comparison returns 0x0 for
///    false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
}
1076 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are unordered with respect to those in the second operand.
///
///    A pair of floating-point values are unordered with respect to each
///    other if one or both values are NaN. Each comparison returns 0x0 for
///    false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
}
1100 
/// Compares two 32-bit float values in the low-order bits of both
///    operands for equality.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
}
1124 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is less than the second
///    operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
}
1149 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is less than or equal to the
///    second operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
}
1173 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is greater than the second
///    operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
}
1197 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is greater than or equal to
///    the second operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
}
1221 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is not equal to the second
///    operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
}
1245 
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine equality.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
}
1268 
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine if the first operand is
///    less than the second operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
}
1292 
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine if the first operand is
///    less than or equal to the second operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
}
1316 
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine if the first operand is
///    greater than the second operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
}
1340 
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine if the first operand is
///    greater than or equal to the second operand.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
}
1364 
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine inequality.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 1. (NEQ is true for unordered operands,
///    matching _mm_comineq_ss above.)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
}
1387 
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 32-bit integer.
///
///    If the converted value does not fit in a 32-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtss_si32(__m128 __a)
{
  return __builtin_ia32_cvtss2si((__v4sf)__a);
}
1409 
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 32-bit integer.
///
///    If the converted value does not fit in a 32-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvt_ss2si(__m128 __a)
{
  /* Legacy alias for _mm_cvtss_si32. */
  return _mm_cvtss_si32(__a);
}
1431 
1432 #ifdef __x86_64__
1433 
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 64-bit integer.
///
///    If the converted value does not fit in a 64-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 64-bit integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtss_si64(__m128 __a)
{
  return __builtin_ia32_cvtss2si64((__v4sf)__a);
}
1455 
1456 #endif
1457 
/// Converts two low-order float values in a 128-bit vector of
///    [4 x float] into a 64-bit vector of [2 x i32].
///
///    If a converted value does not fit in a 32-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi32(__m128 __a)
{
  /* Implemented with the SSE2 packed conversion: zero the upper two lanes,
     convert all four, then truncate the result to its low 64 bits. */
  return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
}
1477 
/// Converts two low-order float values in a 128-bit vector of
///    [4 x float] into a 64-bit vector of [2 x i32].
///
///    If a converted value does not fit in a 32-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_ps2pi(__m128 __a)
{
  /* Legacy alias for _mm_cvtps_pi32. */
  return _mm_cvtps_pi32(__a);
}
1497 
/// Converts the lower (first) element of a vector of [4 x float] into a signed
///    truncated (rounded toward zero) 32-bit integer.
///
///    If the converted value does not fit in a 32-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttss_si32(__m128 __a)
{
  // Truncating conversion: rounds toward zero regardless of MXCSR.RC.
  return __builtin_ia32_cvttss2si((__v4sf)__a);
}
1519 
/// Converts the lower (first) element of a vector of [4 x float] into a signed
///    truncated (rounded toward zero) 32-bit integer.
///
///    If the converted value does not fit in a 32-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtt_ss2si(__m128 __a)
{
  // Legacy alias for _mm_cvttss_si32.
  return _mm_cvttss_si32(__a);
}
1541 
1542 #ifdef __x86_64__
/// Converts the lower (first) element of a vector of [4 x float] into a signed
///    truncated (rounded toward zero) 64-bit integer.
///
///    If the converted value does not fit in a 64-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 64-bit integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttss_si64(__m128 __a)
{
  // Truncating conversion: rounds toward zero regardless of MXCSR.RC.
  return __builtin_ia32_cvttss2si64((__v4sf)__a);
}
1564 #endif
1565 
/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
///    into two signed truncated (rounded toward zero) 32-bit integers,
///    returned in a 64-bit vector of [2 x i32].
///
///    If a converted value does not fit in a 32-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvttps_pi32(__m128 __a)
{
  // Zero the two upper lanes first so the full-width CVTTPS2DQ cannot raise
  // spurious exceptions on them, then keep only the low 64 bits.
  return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
}
1587 
/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
///    into two signed truncated (rounded toward zero) 32-bit integers,
///    returned in a 64-bit vector of [2 x i32].
///
///    If a converted value does not fit in a 32-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtt_ps2pi(__m128 __a)
{
  // Legacy alias for _mm_cvttps_pi32.
  return _mm_cvttps_pi32(__a);
}
1608 
1609 /// Converts a 32-bit signed integer value into a floating point value
1610 ///    and writes it to the lower 32 bits of the destination. The remaining
1611 ///    higher order elements of the destination vector are copied from the
1612 ///    corresponding elements in the first operand.
1613 ///
1614 /// \headerfile <x86intrin.h>
1615 ///
1616 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1617 ///
1618 /// \param __a
1619 ///    A 128-bit vector of [4 x float].
1620 /// \param __b
1621 ///    A 32-bit signed integer operand containing the value to be converted.
1622 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1623 ///    converted value of the second operand. The upper 96 bits are copied from
1624 ///    the upper 96 bits of the first operand.
1625 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi32_ss(__m128 __a,int __b)1626 _mm_cvtsi32_ss(__m128 __a, int __b)
1627 {
1628   __a[0] = __b;
1629   return __a;
1630 }
1631 
/// Converts a 32-bit signed integer value into a floating point value
///    and writes it to the lower 32 bits of the destination. The remaining
///    higher order elements of the destination are copied from the
///    corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 32-bit signed integer operand containing the value to be converted.
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    converted value of the second operand. The upper 96 bits are copied from
///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvt_si2ss(__m128 __a, int __b)
{
  // Legacy alias for _mm_cvtsi32_ss.
  return _mm_cvtsi32_ss(__a, __b);
}
1653 
1654 #ifdef __x86_64__
1655 
1656 /// Converts a 64-bit signed integer value into a floating point value
1657 ///    and writes it to the lower 32 bits of the destination. The remaining
1658 ///    higher order elements of the destination are copied from the
1659 ///    corresponding elements in the first operand.
1660 ///
1661 /// \headerfile <x86intrin.h>
1662 ///
1663 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1664 ///
1665 /// \param __a
1666 ///    A 128-bit vector of [4 x float].
1667 /// \param __b
1668 ///    A 64-bit signed integer operand containing the value to be converted.
1669 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1670 ///    converted value of the second operand. The upper 96 bits are copied from
1671 ///    the upper 96 bits of the first operand.
1672 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi64_ss(__m128 __a,long long __b)1673 _mm_cvtsi64_ss(__m128 __a, long long __b)
1674 {
1675   __a[0] = __b;
1676   return __a;
1677 }
1678 
1679 #endif
1680 
/// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64-bits of the
///    destination. The remaining higher order elements of the destination are
///    copied from the corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
///    and written to the corresponding low-order elements in the destination.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  // Zero-extend __b to 128 bits, convert its four i32 lanes to float, then
  // assemble { cvt[0], cvt[1], __a[2], __a[3] } (shuffle indices 4-7 address
  // the second operand).
  return (__m128)__builtin_shufflevector(
      (__v4sf)__a,
      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
      4, 5, 2, 3);
}
1706 
/// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64-bits of the
///    destination. The remaining higher order elements of the destination are
///    copied from the corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
///    and written to the corresponding low-order elements in the destination.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value from the second operand. The upper 64 bits are copied
///    from the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  // Legacy alias for _mm_cvtpi32_ps.
  return _mm_cvtpi32_ps(__a, __b);
}
1729 
/// Extracts a float value contained in the lower 32 bits of a vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the extraction.
/// \returns A 32-bit float containing the extracted value.
static __inline__ float __DEFAULT_FN_ATTRS
_mm_cvtss_f32(__m128 __a)
{
  // Vector element 0 holds bits [31:0].
  return __a[0];
}
1746 
1747 /// Loads two packed float values from the address \a __p into the
1748 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1749 ///     are copied from the low-order bits of the first operand.
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1754 ///
1755 /// \param __a
1756 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1757 ///    of the destination.
1758 /// \param __p
1759 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1760 ///    [127:64] of the destination.
1761 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1762 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a,const __m64 * __p)1763 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1764 {
1765   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1766   struct __mm_loadh_pi_struct {
1767     __mm_loadh_pi_v2f32 __u;
1768   } __attribute__((__packed__, __may_alias__));
1769   __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1770   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1771   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1772 }
1773 
1774 /// Loads two packed float values from the address \a __p into the
1775 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1776 ///    are copied from the high-order bits of the first operand.
1777 ///
1778 /// \headerfile <x86intrin.h>
1779 ///
1780 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1781 ///
1782 /// \param __a
1783 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1784 ///    [127:64] of the destination.
1785 /// \param __p
1786 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1787 ///    [63:0] of the destination.
1788 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1789 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a,const __m64 * __p)1790 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1791 {
1792   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1793   struct __mm_loadl_pi_struct {
1794     __mm_loadl_pi_v2f32 __u;
1795   } __attribute__((__packed__, __may_alias__));
1796   __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1797   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1798   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1799 }
1800 
1801 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1802 ///    32 bits of the vector are initialized with the single-precision
1803 ///    floating-point value loaded from a specified memory location. The upper
1804 ///    96 bits are set to zero.
1805 ///
1806 /// \headerfile <x86intrin.h>
1807 ///
1808 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1809 ///
1810 /// \param __p
1811 ///    A pointer to a 32-bit memory location containing a single-precision
1812 ///    floating-point value.
1813 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1814 ///    lower 32 bits contain the value loaded from the memory location. The
1815 ///    upper 96 bits are set to zero.
1816 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ss(const float * __p)1817 _mm_load_ss(const float *__p)
1818 {
1819   struct __mm_load_ss_struct {
1820     float __u;
1821   } __attribute__((__packed__, __may_alias__));
1822   float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1823   return __extension__ (__m128){ __u, 0, 0, 0 };
1824 }
1825 
1826 /// Loads a 32-bit float value and duplicates it to all four vector
1827 ///    elements of a 128-bit vector of [4 x float].
1828 ///
1829 /// \headerfile <x86intrin.h>
1830 ///
1831 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1832 ///    instruction.
1833 ///
1834 /// \param __p
1835 ///    A pointer to a float value to be loaded and duplicated.
1836 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1837 ///    duplicated values.
1838 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float * __p)1839 _mm_load1_ps(const float *__p)
1840 {
1841   struct __mm_load1_ps_struct {
1842     float __u;
1843   } __attribute__((__packed__, __may_alias__));
1844   float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1845   return __extension__ (__m128){ __u, __u, __u, __u };
1846 }
1847 
1848 #define        _mm_load_ps1(p) _mm_load1_ps(p)
1849 
/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ps(const float *__p)
{
  // Direct aligned load; __p must be 16-byte aligned or behavior is undefined.
  return *(const __m128*)__p;
}
1866 
/// Loads a 128-bit floating-point vector of [4 x float] from an
///    unaligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadu_ps(const float *__p)
{
  // The packed, may_alias struct (holding the 1-byte-aligned __m128_u) makes
  // this unaligned load legal under strict aliasing.
  struct __loadu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}
1886 
1887 /// Loads four packed float values, in reverse order, from an aligned
1888 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1889 ///
1890 /// \headerfile <x86intrin.h>
1891 ///
1892 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1893 ///    instruction.
1894 ///
1895 /// \param __p
1896 ///    A pointer to a 128-bit memory location. The address of the memory
1897 ///    location has to be 128-bit aligned.
1898 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1899 ///    in reverse order.
1900 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float * __p)1901 _mm_loadr_ps(const float *__p)
1902 {
1903   __m128 __a = _mm_load_ps(__p);
1904   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1905 }
1906 
/// Create a 128-bit vector of [4 x float] with undefined values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x float] containing undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
  // Produces an unspecified value without reading any uninitialized object.
  return (__m128)__builtin_ia32_undef128();
}
1919 
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    32 bits of the vector are initialized with the specified single-precision
///    floating-point value. The upper 96 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize the lower 32
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
///    lower 32 bits contain the value provided in the source operand. The
///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ss(float __w)
{
  // Element 0 is the low 32 bits; the remaining lanes are explicit zeros.
  return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
1939 
/// Constructs a 128-bit floating-point vector of [4 x float], with each
///    of the four single-precision floating-point vector elements set to the
///    specified single-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set1_ps(float __w)
{
  // Broadcast __w into all four lanes.
  return __extension__ (__m128){ __w, __w, __w, __w };
}
1957 
/* Microsoft specific. */
/// Constructs a 128-bit floating-point vector of [4 x float], with each
///    of the four single-precision floating-point vector elements set to the
///    specified single-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps1(float __w)
{
    // Microsoft-compatible alias of _mm_set1_ps.
    return _mm_set1_ps(__w);
}
1976 
/// Constructs a 128-bit floating-point vector of [4 x float]
///    initialized with the specified single-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __z
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \param __y
///    A single-precision floating-point value used to initialize bits [95:64]
///    of the result.
/// \param __x
///    A single-precision floating-point value used to initialize bits [63:32]
///    of the result.
/// \param __w
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps(float __z, float __y, float __x, float __w)
{
  // Note the reversal: arguments are given high-to-low, vector elements are
  // stored low-to-high.
  return __extension__ (__m128){ __w, __x, __y, __z };
}
2003 
/// Constructs a 128-bit floating-point vector of [4 x float],
///    initialized in reverse order with the specified 32-bit single-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __z
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
/// \param __y
///    A single-precision floating-point value used to initialize bits [63:32]
///    of the result.
/// \param __x
///    A single-precision floating-point value used to initialize bits [95:64]
///    of the result.
/// \param __w
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setr_ps(float __z, float __y, float __x, float __w)
{
  // Arguments map directly to elements 0..3 (low-to-high).
  return __extension__ (__m128){ __z, __y, __x, __w };
}
2031 
/// Constructs a 128-bit floating-point vector of [4 x float] initialized
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
///    all elements set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)
{
  // Typically materialized as a register self-XOR rather than a load.
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
2046 
2047 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2048 ///    memory location.
2049 ///
2050 /// \headerfile <x86intrin.h>
2051 ///
2052 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
2053 ///
2054 /// \param __p
2055 ///    A pointer to a 64-bit memory location.
2056 /// \param __a
2057 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2058 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 * __p,__m128 __a)2059 _mm_storeh_pi(__m64 *__p, __m128 __a)
2060 {
2061   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2062   struct __mm_storeh_pi_struct {
2063     __mm_storeh_pi_v2f32 __u;
2064   } __attribute__((__packed__, __may_alias__));
2065   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
2066 }
2067 
2068 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2069 ///     memory location.
2070 ///
2071 /// \headerfile <x86intrin.h>
2072 ///
2073 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2074 ///
2075 /// \param __p
2076 ///    A pointer to a memory location that will receive the float values.
2077 /// \param __a
2078 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2079 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pi(__m64 * __p,__m128 __a)2080 _mm_storel_pi(__m64 *__p, __m128 __a)
2081 {
2082   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2083   struct __mm_storeh_pi_struct {
2084     __mm_storeh_pi_v2f32 __u;
2085   } __attribute__((__packed__, __may_alias__));
2086   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2087 }
2088 
2089 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2090 ///     memory location.
2091 ///
2092 /// \headerfile <x86intrin.h>
2093 ///
2094 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2095 ///
2096 /// \param __p
2097 ///    A pointer to a 32-bit memory location.
2098 /// \param __a
2099 ///    A 128-bit vector of [4 x float] containing the value to be stored.
2100 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ss(float * __p,__m128 __a)2101 _mm_store_ss(float *__p, __m128 __a)
2102 {
2103   struct __mm_store_ss_struct {
2104     float __u;
2105   } __attribute__((__packed__, __may_alias__));
2106   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
2107 }
2108 
/// Stores a 128-bit vector of [4 x float] to an unaligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_ps(float *__p, __m128 __a)
{
  // The packed, may_alias struct (holding the 1-byte-aligned __m128_u) makes
  // this unaligned store legal under strict aliasing.
  struct __storeu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__p)->__v = __a;
}
2129 
/// Stores a 128-bit vector of [4 x float] into an aligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 16-byte aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float *__p, __m128 __a)
{
  // Direct aligned store; __p must be 16-byte aligned or behavior is undefined.
  *(__m128*)__p = __a;
}
2147 
2148 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2149 ///    four contiguous elements in an aligned memory location.
2150 ///
2151 /// \headerfile <x86intrin.h>
2152 ///
2153 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2154 ///    instruction.
2155 ///
2156 /// \param __p
2157 ///    A pointer to a 128-bit memory location.
2158 /// \param __a
2159 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2160 ///    of the four contiguous elements pointed by \a __p.
2161 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float * __p,__m128 __a)2162 _mm_store1_ps(float *__p, __m128 __a)
2163 {
2164   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2165   _mm_store_ps(__p, __a);
2166 }
2167 
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
///    four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
  // Alias of _mm_store1_ps.
  _mm_store1_ps(__p, __a);
}
2186 
2187 /// Stores float values from a 128-bit vector of [4 x float] to an
2188 ///    aligned memory location in reverse order.
2189 ///
2190 /// \headerfile <x86intrin.h>
2191 ///
2192 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2193 ///    instruction.
2194 ///
2195 /// \param __p
2196 ///    A pointer to a 128-bit memory location. The address of the memory
2197 ///    location has to be 128-bit aligned.
2198 /// \param __a
2199 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2200 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float * __p,__m128 __a)2201 _mm_storer_ps(float *__p, __m128 __a)
2202 {
2203   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2204   _mm_store_ps(__p, __a);
2205 }
2206 
2207 #define _MM_HINT_ET0 7
2208 #define _MM_HINT_ET1 6
2209 #define _MM_HINT_T0  3
2210 #define _MM_HINT_T1  2
2211 #define _MM_HINT_T2  1
2212 #define _MM_HINT_NTA 0
2213 
2214 #ifndef _MSC_VER
2215 /* FIXME: We have to #define this because "sel" must be a constant integer, and
2216    Sema doesn't do any form of constant propagation yet. */
2217 
/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    _MM_HINT_ET0 / _MM_HINT_ET1: Like T0 / T1 but with intent to write; bit
///    2 of \a sel selects the read/write argument of __builtin_prefetch and
///    the low two bits select the locality argument.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
2244 #endif
2245 
/// Stores a 64-bit integer in the specified aligned memory location. To
///    minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
///
/// \param __p
///    A pointer to an aligned memory location used to store the register value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pi(void *__p, __m64 __a)
{
  // Non-temporal store hint: avoids polluting the cache where supported.
  __builtin_nontemporal_store(__a, (__m64 *)__p);
}
2263 
/// Moves packed float values from a 128-bit vector of [4 x float] to a
///    128-bit aligned memory location. To minimize caching, the data is flagged
///    as non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit aligned memory location that will receive the
///    single-precision floating-point values.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(void *__p, __m128 __a)
{
  // Non-temporal store hint; __p must be 16-byte aligned.
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
2282 
#if defined(__cplusplus)
// Declared with C linkage so the symbol name is not mangled when this header
// is included from C++.
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between store
///    instructions preceding this instruction and store instructions following
///    this instruction, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
2301 
/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
// The macro arguments are fully parenthesized so that expression arguments
// (e.g. `x ^ y`) bind correctly under the casts; the (__m64) cast matches the
// convention used by _mm_shuffle_pi16 below.
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), \
                                                    (int)(n)))
2324 
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination are written. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.  \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
// The macro arguments are fully parenthesized so that expression arguments
// bind correctly under the casts; the (__m64) cast matches the convention
// used by _mm_shuffle_pi16 below.
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))
2355 
2356 /// Compares each of the corresponding packed 16-bit integer values of
2357 ///    the 64-bit integer vectors, and writes the greater value to the
2358 ///    corresponding bits in the destination.
2359 ///
2360 /// \headerfile <x86intrin.h>
2361 ///
2362 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2363 ///
2364 /// \param __a
2365 ///    A 64-bit integer vector containing one of the source operands.
2366 /// \param __b
2367 ///    A 64-bit integer vector containing one of the source operands.
2368 /// \returns A 64-bit integer vector containing the comparison results.
2369 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pi16(__m64 __a,__m64 __b)2370 _mm_max_pi16(__m64 __a, __m64 __b)
2371 {
2372   return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
2373 }
2374 
2375 /// Compares each of the corresponding packed 8-bit unsigned integer
2376 ///    values of the 64-bit integer vectors, and writes the greater value to the
2377 ///    corresponding bits in the destination.
2378 ///
2379 /// \headerfile <x86intrin.h>
2380 ///
2381 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2382 ///
2383 /// \param __a
2384 ///    A 64-bit integer vector containing one of the source operands.
2385 /// \param __b
2386 ///    A 64-bit integer vector containing one of the source operands.
2387 /// \returns A 64-bit integer vector containing the comparison results.
2388 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pu8(__m64 __a,__m64 __b)2389 _mm_max_pu8(__m64 __a, __m64 __b)
2390 {
2391   return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
2392 }
2393 
2394 /// Compares each of the corresponding packed 16-bit integer values of
2395 ///    the 64-bit integer vectors, and writes the lesser value to the
2396 ///    corresponding bits in the destination.
2397 ///
2398 /// \headerfile <x86intrin.h>
2399 ///
2400 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2401 ///
2402 /// \param __a
2403 ///    A 64-bit integer vector containing one of the source operands.
2404 /// \param __b
2405 ///    A 64-bit integer vector containing one of the source operands.
2406 /// \returns A 64-bit integer vector containing the comparison results.
2407 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pi16(__m64 __a,__m64 __b)2408 _mm_min_pi16(__m64 __a, __m64 __b)
2409 {
2410   return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
2411 }
2412 
2413 /// Compares each of the corresponding packed 8-bit unsigned integer
2414 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2415 ///    corresponding bits in the destination.
2416 ///
2417 /// \headerfile <x86intrin.h>
2418 ///
2419 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2420 ///
2421 /// \param __a
2422 ///    A 64-bit integer vector containing one of the source operands.
2423 /// \param __b
2424 ///    A 64-bit integer vector containing one of the source operands.
2425 /// \returns A 64-bit integer vector containing the comparison results.
2426 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pu8(__m64 __a,__m64 __b)2427 _mm_min_pu8(__m64 __a, __m64 __b)
2428 {
2429   return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
2430 }
2431 
2432 /// Takes the most significant bit from each 8-bit element in a 64-bit
2433 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
2434 ///    32-bit integer and writes it to the destination.
2435 ///
2436 /// \headerfile <x86intrin.h>
2437 ///
2438 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2439 ///
2440 /// \param __a
2441 ///    A 64-bit integer vector containing the values with bits to be extracted.
2442 /// \returns The most significant bit from each 8-bit element in \a __a,
2443 ///    written to bits [7:0].
2444 static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_movemask_pi8(__m64 __a)2445 _mm_movemask_pi8(__m64 __a)
2446 {
2447   return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
2448 }
2449 
2450 /// Multiplies packed 16-bit unsigned integer values and writes the
2451 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2452 ///    the destination.
2453 ///
2454 /// \headerfile <x86intrin.h>
2455 ///
2456 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2457 ///
2458 /// \param __a
2459 ///    A 64-bit integer vector containing one of the source operands.
2460 /// \param __b
2461 ///    A 64-bit integer vector containing one of the source operands.
2462 /// \returns A 64-bit integer vector containing the products of both operands.
2463 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pu16(__m64 __a,__m64 __b)2464 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2465 {
2466   return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
2467                                              (__v8hi)__anyext128(__b)));
2468 }
2469 
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
// Implementation note: each shuffle index is masked to 0..3, so elements are
// selected only from (a); the zero vector supplied as the second
// __builtin_shufflevector operand is an unused placeholder.
#define _mm_shuffle_pi16(a, n)                                                 \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3,                 \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
2508 
/// Conditionally copies the values from each 8-bit element in the first
///    64-bit integer vector operand to the specified memory location, as
///    specified by the most significant bit in the corresponding element in the
///    second 64-bit integer vector operand.
///
///    To minimize caching, the data is flagged as non-temporal
///    (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
///
/// \param __d
///    A 64-bit integer vector containing the values with elements to be copied.
/// \param __n
///    A 64-bit integer vector operand. The most significant bit from each 8-bit
///    element determines whether the corresponding element in operand \a __d
///    is copied. If the most significant bit of a given element is 1, the
///    corresponding element in operand \a __d is copied.
/// \param __p
///    A pointer to a 64-bit memory location that will receive the conditionally
///    copied integer values. The address of the memory location does not have
///    to be aligned.
static __inline__ void __DEFAULT_FN_ATTRS_SSE2
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  // This is complex, because we need to support the case where __p is pointing
  // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
  // write might cause a trap where a 64-bit maskmovq would not. (Memory
  // locations not selected by the mask bits might still cause traps.)
  //
  // NOTE(review): the offset test below assumes 4 KiB (0x1000-byte) pages;
  // `(__SIZE_TYPE__)__p & 0xfff` extracts the byte offset within such a page.
  __m128i __d128  = __anyext128(__d);
  // The mask must be zero-extended so the high 8 lanes never select a store.
  __m128i __n128  = __zext128(__n);
  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
      ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
    // If there's a risk of spurious trap due to a 128-bit write, back up the
    // pointer by 8 bytes and shift values in registers to match.
    __p -= 8;
    __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
    __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
  }

  __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
}
2552 
2553 /// Computes the rounded averages of the packed unsigned 8-bit integer
2554 ///    values and writes the averages to the corresponding bits in the
2555 ///    destination.
2556 ///
2557 /// \headerfile <x86intrin.h>
2558 ///
2559 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2560 ///
2561 /// \param __a
2562 ///    A 64-bit integer vector containing one of the source operands.
2563 /// \param __b
2564 ///    A 64-bit integer vector containing one of the source operands.
2565 /// \returns A 64-bit integer vector containing the averages of both operands.
2566 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu8(__m64 __a,__m64 __b)2567 _mm_avg_pu8(__m64 __a, __m64 __b)
2568 {
2569   return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
2570                                            (__v16qi)__anyext128(__b)));
2571 }
2572 
2573 /// Computes the rounded averages of the packed unsigned 16-bit integer
2574 ///    values and writes the averages to the corresponding bits in the
2575 ///    destination.
2576 ///
2577 /// \headerfile <x86intrin.h>
2578 ///
2579 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2580 ///
2581 /// \param __a
2582 ///    A 64-bit integer vector containing one of the source operands.
2583 /// \param __b
2584 ///    A 64-bit integer vector containing one of the source operands.
2585 /// \returns A 64-bit integer vector containing the averages of both operands.
2586 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu16(__m64 __a,__m64 __b)2587 _mm_avg_pu16(__m64 __a, __m64 __b)
2588 {
2589   return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
2590                                            (__v8hi)__anyext128(__b)));
2591 }
2592 
2593 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2594 ///    64-bit vector operands and computes the absolute value for each of the
2595 ///    difference. Then sum of the 8 absolute differences is written to the
2596 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2597 ///
2598 /// \headerfile <x86intrin.h>
2599 ///
2600 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2601 ///
2602 /// \param __a
2603 ///    A 64-bit integer vector containing one of the source operands.
2604 /// \param __b
2605 ///    A 64-bit integer vector containing one of the source operands.
2606 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2607 ///    sets of absolute differences between both operands. The upper bits are
2608 ///    cleared.
2609 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sad_pu8(__m64 __a,__m64 __b)2610 _mm_sad_pu8(__m64 __a, __m64 __b)
2611 {
2612   return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
2613                                             (__v16qi)__zext128(__b)));
2614 }
2615 
2616 #if defined(__cplusplus)
2617 extern "C" {
2618 #endif
2619 
2620 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2621 ///    integer value.
2622 ///
2623 ///    There are several groups of macros associated with this
2624 ///    intrinsic, including:
2625 ///    <ul>
2626 ///    <li>
2627 ///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2628 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2629 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2630 ///      _MM_GET_EXCEPTION_STATE().
2631 ///    </li>
2632 ///    <li>
2633 ///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2634 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2635 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2636 ///    </li>
2637 ///    <li>
2638 ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2639 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2640 ///      _MM_GET_ROUNDING_MODE().
2641 ///    </li>
2642 ///    <li>
2643 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2644 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2645 ///    </li>
2646 ///    <li>
2647 ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2648 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2649 ///      _MM_GET_DENORMALS_ZERO_MODE().
2650 ///    </li>
2651 ///    </ul>
2652 ///
2653 ///    For example, the following expression checks if an overflow exception has
2654 ///    occurred:
2655 ///    \code
2656 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2657 ///    \endcode
2658 ///
2659 ///    The following expression gets the current rounding mode:
2660 ///    \code
2661 ///      _MM_GET_ROUNDING_MODE()
2662 ///    \endcode
2663 ///
2664 /// \headerfile <x86intrin.h>
2665 ///
2666 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2667 ///
2668 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2669 ///    register.
2670 unsigned int _mm_getcsr(void);
2671 
2672 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2673 ///
2674 ///    There are several groups of macros associated with this intrinsic,
2675 ///    including:
2676 ///    <ul>
2677 ///    <li>
2678 ///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2679 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2680 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2681 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2682 ///    </li>
2683 ///    <li>
2684 ///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2685 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2686 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2687 ///      of these macros.
2688 ///    </li>
2689 ///    <li>
2690 ///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2691 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2692 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2693 ///    </li>
2694 ///    <li>
2695 ///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2696 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2697 ///      one of these macros.
2698 ///    </li>
2699 ///    <li>
2700 ///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2701 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2702 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2703 ///    </li>
2704 ///    </ul>
2705 ///
///    For example, the following expression causes subsequent floating-point
///    operations to round up (note that OR-ing in the mode bits is only
///    correct when both rounding-control bits are currently clear, i.e. the
///    mode is _MM_ROUND_NEAREST; otherwise use _MM_SET_ROUNDING_MODE, which
///    replaces the field):
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
2709 ///
2710 ///    The following example sets the DAZ and FTZ flags:
2711 ///    \code
2712 ///    void setFlags() {
2713 ///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2714 ///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2715 ///    }
2716 ///    \endcode
2717 ///
2718 /// \headerfile <x86intrin.h>
2719 ///
2720 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2721 ///
2722 /// \param __i
2723 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
2724 void _mm_setcsr(unsigned int __i);
2725 
2726 #if defined(__cplusplus)
2727 } // extern "C"
2728 #endif
2729 
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand. \n
///    Note: \a mask must be an immediate (compile-time-constant) value. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2772 
2773 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2774 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2775 ///
2776 /// \headerfile <x86intrin.h>
2777 ///
2778 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2779 ///
2780 /// \param __a
2781 ///    A 128-bit vector of [4 x float]. \n
2782 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
2783 ///    Bits [127:96] are written to bits [95:64] of the destination.
2784 /// \param __b
2785 ///    A 128-bit vector of [4 x float].
2786 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
2787 ///    Bits [127:96] are written to bits [127:96] of the destination.
2788 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2789 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpackhi_ps(__m128 __a,__m128 __b)2790 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2791 {
2792   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2793 }
2794 
2795 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2796 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2797 ///
2798 /// \headerfile <x86intrin.h>
2799 ///
2800 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2801 ///
2802 /// \param __a
2803 ///    A 128-bit vector of [4 x float]. \n
2804 ///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2805 ///    Bits [63:32] are written to bits [95:64] of the destination.
2806 /// \param __b
2807 ///    A 128-bit vector of [4 x float]. \n
2808 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
2809 ///    Bits [63:32] are written to bits [127:96] of the destination.
2810 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2811 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpacklo_ps(__m128 __a,__m128 __b)2812 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2813 {
2814   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2815 }
2816 
2817 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2818 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
2819 ///    96 bits are set to the upper 96 bits of the first parameter.
2820 ///
2821 /// \headerfile <x86intrin.h>
2822 ///
2823 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2824 ///    instruction.
2825 ///
2826 /// \param __a
2827 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2828 ///    written to the upper 96 bits of the result.
2829 /// \param __b
2830 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2831 ///    written to the lower 32 bits of the result.
2832 /// \returns A 128-bit floating-point vector of [4 x float].
2833 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_move_ss(__m128 __a,__m128 __b)2834 _mm_move_ss(__m128 __a, __m128 __b)
2835 {
2836   __a[0] = __b[0];
2837   return __a;
2838 }
2839 
2840 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2841 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2842 ///    64 bits are set to the upper 64 bits of the first parameter.
2843 ///
2844 /// \headerfile <x86intrin.h>
2845 ///
2846 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2847 ///
2848 /// \param __a
2849 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2850 ///    written to the upper 64 bits of the result.
2851 /// \param __b
2852 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2853 ///    written to the lower 64 bits of the result.
2854 /// \returns A 128-bit floating-point vector of [4 x float].
2855 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehl_ps(__m128 __a,__m128 __b)2856 _mm_movehl_ps(__m128 __a, __m128 __b)
2857 {
2858   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2859 }
2860 
2861 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2862 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2863 ///    64 bits are set to the lower 64 bits of the second parameter.
2864 ///
2865 /// \headerfile <x86intrin.h>
2866 ///
2867 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2868 ///
2869 /// \param __a
2870 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2871 ///    written to the lower 64 bits of the result.
2872 /// \param __b
2873 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2874 ///    written to the upper 64 bits of the result.
2875 /// \returns A 128-bit floating-point vector of [4 x float].
2876 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movelh_ps(__m128 __a,__m128 __b)2877 _mm_movelh_ps(__m128 __a, __m128 __b)
2878 {
2879   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2880 }
2881 
2882 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2883 ///    float].
2884 ///
2885 /// \headerfile <x86intrin.h>
2886 ///
2887 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2888 ///
2889 /// \param __a
2890 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2891 ///    from the corresponding elements in this operand.
2892 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2893 ///    values from the operand.
2894 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi16_ps(__m64 __a)2895 _mm_cvtpi16_ps(__m64 __a)
2896 {
2897   return __builtin_convertvector((__v4hi)__a, __v4sf);
2898 }
2899 
2900 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2901 ///    128-bit vector of [4 x float].
2902 ///
2903 /// \headerfile <x86intrin.h>
2904 ///
2905 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2906 ///
2907 /// \param __a
2908 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2909 ///    destination are copied from the corresponding elements in this operand.
2910 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2911 ///    values from the operand.
2912 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu16_ps(__m64 __a)2913 _mm_cvtpu16_ps(__m64 __a)
2914 {
2915   return __builtin_convertvector((__v4hu)__a, __v4sf);
2916 }
2917 
2918 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2919 ///    into a 128-bit vector of [4 x float].
2920 ///
2921 /// \headerfile <x86intrin.h>
2922 ///
2923 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2924 ///
2925 /// \param __a
2926 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2927 ///    from the corresponding lower 4 elements in this operand.
2928 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2929 ///    values from the operand.
2930 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi8_ps(__m64 __a)2931 _mm_cvtpi8_ps(__m64 __a)
2932 {
2933   return __builtin_convertvector(
2934       __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
2935                               0, 1, 2, 3), __v4sf);
2936 }
2937 
2938 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2939 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2940 ///
2941 /// \headerfile <x86intrin.h>
2942 ///
2943 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2944 ///
2945 /// \param __a
2946 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2947 ///    destination are copied from the corresponding lower 4 elements in this
2948 ///    operand.
2949 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2950 ///    values from the source operand.
2951 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu8_ps(__m64 __a)2952 _mm_cvtpu8_ps(__m64 __a)
2953 {
2954   return __builtin_convertvector(
2955       __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
2956                               0, 1, 2, 3), __v4sf);
2957 }
2958 
2959 /// Converts the two 32-bit signed integer values from each 64-bit vector
2960 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2961 ///
2962 /// \headerfile <x86intrin.h>
2963 ///
2964 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2965 ///
2966 /// \param __a
2967 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2968 ///    copied from the elements in this operand.
2969 /// \param __b
2970 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2971 ///    copied from the elements in this operand.
2972 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2973 ///    copied and converted values from the first operand. The upper 64 bits
2974 ///    contain the copied and converted values from the second operand.
2975 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32x2_ps(__m64 __a,__m64 __b)2976 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2977 {
2978   return __builtin_convertvector(
2979       __builtin_shufflevector((__v2si)__a, (__v2si)__b,
2980                               0, 1, 2, 3), __v4sf);
2981 }
2982 
2983 /// Converts each single-precision floating-point element of a 128-bit
2984 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2985 ///    packs the results into a 64-bit integer vector of [4 x i16].
2986 ///
2987 ///    If the floating-point element is NaN or infinity, or if the
2988 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2989 ///    it is converted to 0x8000. Otherwise if the floating-point element is
2990 ///    greater than 0x7FFF, it is converted to 0x7FFF.
2991 ///
2992 /// \headerfile <x86intrin.h>
2993 ///
2994 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2995 ///
2996 /// \param __a
2997 ///    A 128-bit floating-point vector of [4 x float].
2998 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2999 ///    values.
3000 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi16(__m128 __a)3001 _mm_cvtps_pi16(__m128 __a)
3002 {
3003   return __trunc64(__builtin_ia32_packssdw128(
3004       (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
3005 }
3006 
3007 /// Converts each single-precision floating-point element of a 128-bit
3008 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
3009 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
3010 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
3011 ///
3012 ///    If the floating-point element is NaN or infinity, or if the
3013 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
3014 ///    is converted to 0x80. Otherwise if the floating-point element is greater
3015 ///    than 0x7F, it is converted to 0x7F.
3016 ///
3017 /// \headerfile <x86intrin.h>
3018 ///
3019 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3020 ///
3021 /// \param __a
3022 ///    128-bit floating-point vector of [4 x float].
3023 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
3025 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi8(__m128 __a)3026 _mm_cvtps_pi8(__m128 __a)
3027 {
3028   __m64 __b, __c;
3029 
3030   __b = _mm_cvtps_pi16(__a);
3031   __c = _mm_setzero_si64();
3032 
3033   return _mm_packs_pi16(__b, __c);
3034 }
3035 
3036 /// Extracts the sign bits from each single-precision floating-point
3037 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
///    sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
3039 ///    to zero.
3040 ///
3041 /// \headerfile <x86intrin.h>
3042 ///
3043 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3044 ///
3045 /// \param __a
3046 ///    A 128-bit floating-point vector of [4 x float].
3047 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3048 ///    single-precision floating-point element of the parameter. Bits [31:4] are
3049 ///    set to zero.
3050 static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_ps(__m128 __a)3051 _mm_movemask_ps(__m128 __a)
3052 {
3053   return __builtin_ia32_movmskps((__v4sf)__a);
3054 }
3055 
/* Compare */
/* Predicate values for the immediate operand of _mm_cmp_ps / _mm_cmp_ss
   below.  OQ/Q = ordered/quiet (non-signaling), OS/US = signaling. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
3065 
3066 /// Compares each of the corresponding values of two 128-bit vectors of
3067 ///    [4 x float], using the operation specified by the immediate integer
3068 ///    operand.
3069 ///
3070 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3071 ///    If either value in a comparison is NaN, comparisons that are ordered
3072 ///    return false, and comparisons that are unordered return true.
3073 ///
3074 /// \headerfile <x86intrin.h>
3075 ///
3076 /// \code
3077 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
3078 /// \endcode
3079 ///
3080 /// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
3081 ///
3082 /// \param a
3083 ///    A 128-bit vector of [4 x float].
3084 /// \param b
3085 ///    A 128-bit vector of [4 x float].
3086 /// \param c
3087 ///    An immediate integer operand, with bits [4:0] specifying which comparison
3088 ///    operation to use: \n
3089 ///    0x00: Equal (ordered, non-signaling) \n
3090 ///    0x01: Less-than (ordered, signaling) \n
3091 ///    0x02: Less-than-or-equal (ordered, signaling) \n
3092 ///    0x03: Unordered (non-signaling) \n
3093 ///    0x04: Not-equal (unordered, non-signaling) \n
3094 ///    0x05: Not-less-than (unordered, signaling) \n
3095 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
3096 ///    0x07: Ordered (non-signaling) \n
3097 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Implemented as a macro: the predicate c is forwarded unevaluated to the
   builtin, which takes it as an immediate. */
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3100 
3101 /// Compares each of the corresponding scalar values of two 128-bit
3102 ///    vectors of [4 x float], using the operation specified by the immediate
3103 ///    integer operand.
3104 ///
3105 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3106 ///    If either value in a comparison is NaN, comparisons that are ordered
3107 ///    return false, and comparisons that are unordered return true.
3108 ///
3109 /// \headerfile <x86intrin.h>
3110 ///
3111 /// \code
3112 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
3113 /// \endcode
3114 ///
3115 /// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
3116 ///
3117 /// \param a
3118 ///    A 128-bit vector of [4 x float].
3119 /// \param b
3120 ///    A 128-bit vector of [4 x float].
3121 /// \param c
3122 ///    An immediate integer operand, with bits [4:0] specifying which comparison
3123 ///    operation to use: \n
3124 ///    0x00: Equal (ordered, non-signaling) \n
3125 ///    0x01: Less-than (ordered, signaling) \n
3126 ///    0x02: Less-than-or-equal (ordered, signaling) \n
3127 ///    0x03: Unordered (non-signaling) \n
3128 ///    0x04: Not-equal (unordered, non-signaling) \n
3129 ///    0x05: Not-less-than (unordered, signaling) \n
3130 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
3131 ///    0x07: Ordered (non-signaling) \n
3132 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Implemented as a macro: the predicate c is forwarded unevaluated to the
   builtin, which takes it as an immediate. */
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3135 
/* Requests 16-byte alignment for the declared object. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Builds the 8-bit immediate used by the shuffle intrinsics: selector z
   occupies bits [7:6], y bits [5:4], x bits [3:2], and w bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3139 
/* MXCSR exception status flags (bits [5:0]). */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* MXCSR exception mask bits (bits [12:7]). */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* MXCSR rounding-control field (bits [14:13]). */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero control (bit 15). */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)

/* Read a single MXCSR field out of _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Update a single MXCSR field via a read-modify-write of the register. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3175 
/* Transposes, in place, the 4x4 matrix whose rows are the four [4 x float]
   vectors row0..row3.  Each row argument is read and then reassigned, so the
   arguments must be modifiable lvalues. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
3188 
/* Aliases for compatibility. */
/* Legacy MMX-era names for the integer intrinsics above; provided for
   gcc compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
3204 
/* Clean up the helper macros that are private to this header. */
#undef __trunc64
#undef __zext128
#undef __anyext128
#undef __zeroupper64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_SSE2

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
#include <emmintrin.h>
#endif

#endif /* __XMMINTRIN_H */
3218