/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9 
10 #ifndef __MMINTRIN_H
11 #define __MMINTRIN_H
12 
/* Refuse to compile on anything other than 32-bit or 64-bit x86. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
16 
/* Public 64-bit vector type: one 64-bit element, 8-byte aligned. */
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));

/* Internal element views of a 64-bit vector. Plain `char` has
 * implementation-defined signedness; see __v8qs / __v8qu below when the
 * signedness matters. */
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));

/* Unsigned types */
typedef unsigned long long __v1du __attribute__((__vector_size__(8)));
typedef unsigned int __v2su __attribute__((__vector_size__(8)));
typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
typedef unsigned char __v8qu __attribute__((__vector_size__(8)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v8qs __attribute__((__vector_size__(8)));

/* SSE/SSE2 types: 128-bit vectors used internally by the implementations
 * that widen their 64-bit operands. */
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
40 
/* Define the default attributes for the functions in this file.
 *
 * Both variants force inlining, suppress debug info, require SSE2 and declare
 * a minimum vector width of 128 bits. When __EVEX512__ is defined without
 * __AVX10_1_512__, the target string additionally carries "no-evex512". */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#endif
51 
/* Keep only element 0 (the low 64 bits) of a 128-bit vector, typed as __m64.
 * The whole expansion is parenthesised so the macro behaves as a single
 * primary expression wherever it is used. */
#define __trunc64(x)                                                           \
  ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))
/* Widen a 64-bit vector to 128 bits. Elements 0 and 1 of the [2 x i32] view
 * are preserved; the two upper elements use shuffle index -1 and are therefore
 * left unspecified. */
#define __anyext128(x)                                                         \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1))
57 
58 /// Clears the MMX state by setting the state of the x87 stack registers
59 ///    to empty.
60 ///
61 /// \headerfile <x86intrin.h>
62 ///
63 /// This intrinsic corresponds to the <c> EMMS </c> instruction.
64 ///
65 static __inline__ void __attribute__((__always_inline__, __nodebug__,
66                                       __target__("mmx,no-evex512")))
_mm_empty(void)67 _mm_empty(void) {
68   __builtin_ia32_emms();
69 }
70 
71 /// Constructs a 64-bit integer vector, setting the lower 32 bits to the
72 ///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
73 ///
74 /// \headerfile <x86intrin.h>
75 ///
76 /// This intrinsic corresponds to the <c> MOVD </c> instruction.
77 ///
78 /// \param __i
79 ///    A 32-bit integer value.
80 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
81 ///    parameter. The upper 32 bits are set to 0.
82 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtsi32_si64(int __i)83 _mm_cvtsi32_si64(int __i)
84 {
85     return __extension__ (__m64)(__v2si){__i, 0};
86 }
87 
88 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
89 ///    signed integer.
90 ///
91 /// \headerfile <x86intrin.h>
92 ///
93 /// This intrinsic corresponds to the <c> MOVD </c> instruction.
94 ///
95 /// \param __m
96 ///    A 64-bit integer vector.
97 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
98 ///    parameter.
99 static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_cvtsi64_si32(__m64 __m)100 _mm_cvtsi64_si32(__m64 __m)
101 {
102     return ((__v2si)__m)[0];
103 }
104 
105 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
106 ///
107 /// \headerfile <x86intrin.h>
108 ///
109 /// This intrinsic corresponds to the <c> MOVQ </c> instruction.
110 ///
111 /// \param __i
112 ///    A 64-bit signed integer.
113 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
114 ///    parameter.
115 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtsi64_m64(long long __i)116 _mm_cvtsi64_m64(long long __i)
117 {
118     return (__m64)__i;
119 }
120 
121 /// Casts a 64-bit integer vector into a 64-bit signed integer value.
122 ///
123 /// \headerfile <x86intrin.h>
124 ///
125 /// This intrinsic corresponds to the <c> MOVQ </c> instruction.
126 ///
127 /// \param __m
128 ///    A 64-bit integer vector.
129 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
130 ///    parameter.
131 static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
_mm_cvtm64_si64(__m64 __m)132 _mm_cvtm64_si64(__m64 __m)
133 {
134     return (long long)__m;
135 }
136 
137 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
138 ///    vector parameters of [4 x i16] into 8-bit signed integer values, and
139 ///    constructs a 64-bit integer vector of [8 x i8] as the result.
140 ///
141 ///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
142 ///    less than 0x80 are saturated to 0x80.
143 ///
144 /// \headerfile <x86intrin.h>
145 ///
146 /// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
147 ///
148 /// \param __m1
149 ///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
150 ///    written to the lower 32 bits of the result.
151 /// \param __m2
152 ///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
153 ///    written to the upper 32 bits of the result.
154 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
155 ///    values.
156 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pi16(__m64 __m1,__m64 __m2)157 _mm_packs_pi16(__m64 __m1, __m64 __m2)
158 {
159     return __trunc64(__builtin_ia32_packsswb128(
160         (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
161 }
162 
163 /// Converts, with saturation, 32-bit signed integers from both 64-bit integer
164 ///    vector parameters of [2 x i32] into 16-bit signed integer values, and
165 ///    constructs a 64-bit integer vector of [4 x i16] as the result.
166 ///
167 ///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
168 ///    values less than 0x8000 are saturated to 0x8000.
169 ///
170 /// \headerfile <x86intrin.h>
171 ///
172 /// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
173 ///
174 /// \param __m1
175 ///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
176 ///    written to the lower 32 bits of the result.
177 /// \param __m2
178 ///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
179 ///    written to the upper 32 bits of the result.
180 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
181 ///    values.
182 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pi32(__m64 __m1,__m64 __m2)183 _mm_packs_pi32(__m64 __m1, __m64 __m2)
184 {
185     return __trunc64(__builtin_ia32_packssdw128(
186         (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
187 }
188 
189 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
190 ///    vector parameters of [4 x i16] into 8-bit unsigned integer values, and
191 ///    constructs a 64-bit integer vector of [8 x i8] as the result.
192 ///
193 ///    Values greater than 0xFF are saturated to 0xFF. Values less than 0 are
194 ///    saturated to 0.
195 ///
196 /// \headerfile <x86intrin.h>
197 ///
198 /// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
199 ///
200 /// \param __m1
201 ///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
202 ///    written to the lower 32 bits of the result.
203 /// \param __m2
204 ///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
205 ///    written to the upper 32 bits of the result.
206 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
207 ///    values.
208 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pu16(__m64 __m1,__m64 __m2)209 _mm_packs_pu16(__m64 __m1, __m64 __m2)
210 {
211     return __trunc64(__builtin_ia32_packuswb128(
212         (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
213 }
214 
215 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
216 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
217 ///
218 /// \headerfile <x86intrin.h>
219 ///
220 /// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
221 ///
222 /// \param __m1
223 ///    A 64-bit integer vector of [8 x i8]. \n
224 ///    Bits [39:32] are written to bits [7:0] of the result. \n
225 ///    Bits [47:40] are written to bits [23:16] of the result. \n
226 ///    Bits [55:48] are written to bits [39:32] of the result. \n
227 ///    Bits [63:56] are written to bits [55:48] of the result.
228 /// \param __m2
229 ///    A 64-bit integer vector of [8 x i8].
230 ///    Bits [39:32] are written to bits [15:8] of the result. \n
231 ///    Bits [47:40] are written to bits [31:24] of the result. \n
232 ///    Bits [55:48] are written to bits [47:40] of the result. \n
233 ///    Bits [63:56] are written to bits [63:56] of the result.
234 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
235 ///    values.
236 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpackhi_pi8(__m64 __m1,__m64 __m2)237 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
238 {
239     return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
240                                           4, 12, 5, 13, 6, 14, 7, 15);
241 }
242 
243 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
244 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
245 ///
246 /// \headerfile <x86intrin.h>
247 ///
248 /// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
249 ///
250 /// \param __m1
251 ///    A 64-bit integer vector of [4 x i16].
252 ///    Bits [47:32] are written to bits [15:0] of the result. \n
253 ///    Bits [63:48] are written to bits [47:32] of the result.
254 /// \param __m2
255 ///    A 64-bit integer vector of [4 x i16].
256 ///    Bits [47:32] are written to bits [31:16] of the result. \n
257 ///    Bits [63:48] are written to bits [63:48] of the result.
258 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
259 ///    values.
260 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpackhi_pi16(__m64 __m1,__m64 __m2)261 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
262 {
263     return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
264                                           2, 6, 3, 7);
265 }
266 
267 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
268 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
269 ///
270 /// \headerfile <x86intrin.h>
271 ///
272 /// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
273 ///
274 /// \param __m1
275 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
276 ///    the lower 32 bits of the result.
277 /// \param __m2
278 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
279 ///    the upper 32 bits of the result.
280 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
281 ///    values.
282 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpackhi_pi32(__m64 __m1,__m64 __m2)283 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
284 {
285     return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
286 }
287 
288 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
289 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
290 ///
291 /// \headerfile <x86intrin.h>
292 ///
293 /// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
294 ///
295 /// \param __m1
296 ///    A 64-bit integer vector of [8 x i8].
297 ///    Bits [7:0] are written to bits [7:0] of the result. \n
298 ///    Bits [15:8] are written to bits [23:16] of the result. \n
299 ///    Bits [23:16] are written to bits [39:32] of the result. \n
300 ///    Bits [31:24] are written to bits [55:48] of the result.
301 /// \param __m2
302 ///    A 64-bit integer vector of [8 x i8].
303 ///    Bits [7:0] are written to bits [15:8] of the result. \n
304 ///    Bits [15:8] are written to bits [31:24] of the result. \n
305 ///    Bits [23:16] are written to bits [47:40] of the result. \n
306 ///    Bits [31:24] are written to bits [63:56] of the result.
307 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
308 ///    values.
309 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpacklo_pi8(__m64 __m1,__m64 __m2)310 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
311 {
312     return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
313                                           0, 8, 1, 9, 2, 10, 3, 11);
314 }
315 
316 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
317 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
318 ///
319 /// \headerfile <x86intrin.h>
320 ///
321 /// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
322 ///
323 /// \param __m1
324 ///    A 64-bit integer vector of [4 x i16].
325 ///    Bits [15:0] are written to bits [15:0] of the result. \n
326 ///    Bits [31:16] are written to bits [47:32] of the result.
327 /// \param __m2
328 ///    A 64-bit integer vector of [4 x i16].
329 ///    Bits [15:0] are written to bits [31:16] of the result. \n
330 ///    Bits [31:16] are written to bits [63:48] of the result.
331 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
332 ///    values.
333 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpacklo_pi16(__m64 __m1,__m64 __m2)334 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
335 {
336     return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
337                                           0, 4, 1, 5);
338 }
339 
340 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
341 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
346 ///
347 /// \param __m1
348 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
349 ///    the lower 32 bits of the result.
350 /// \param __m2
351 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
352 ///    the upper 32 bits of the result.
353 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
354 ///    values.
355 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpacklo_pi32(__m64 __m1,__m64 __m2)356 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
357 {
358     return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
359 }
360 
361 /// Adds each 8-bit integer element of the first 64-bit integer vector
362 ///    of [8 x i8] to the corresponding 8-bit integer element of the second
363 ///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
364 ///    packed into a 64-bit integer vector of [8 x i8].
365 ///
366 /// \headerfile <x86intrin.h>
367 ///
368 /// This intrinsic corresponds to the <c> PADDB </c> instruction.
369 ///
370 /// \param __m1
371 ///    A 64-bit integer vector of [8 x i8].
372 /// \param __m2
373 ///    A 64-bit integer vector of [8 x i8].
374 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
375 ///    parameters.
376 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi8(__m64 __m1,__m64 __m2)377 _mm_add_pi8(__m64 __m1, __m64 __m2)
378 {
379     return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
380 }
381 
382 /// Adds each 16-bit integer element of the first 64-bit integer vector
383 ///    of [4 x i16] to the corresponding 16-bit integer element of the second
384 ///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
385 ///    packed into a 64-bit integer vector of [4 x i16].
386 ///
387 /// \headerfile <x86intrin.h>
388 ///
389 /// This intrinsic corresponds to the <c> PADDW </c> instruction.
390 ///
391 /// \param __m1
392 ///    A 64-bit integer vector of [4 x i16].
393 /// \param __m2
394 ///    A 64-bit integer vector of [4 x i16].
395 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
396 ///    parameters.
397 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi16(__m64 __m1,__m64 __m2)398 _mm_add_pi16(__m64 __m1, __m64 __m2)
399 {
400     return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
401 }
402 
403 /// Adds each 32-bit integer element of the first 64-bit integer vector
404 ///    of [2 x i32] to the corresponding 32-bit integer element of the second
405 ///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
406 ///    packed into a 64-bit integer vector of [2 x i32].
407 ///
408 /// \headerfile <x86intrin.h>
409 ///
410 /// This intrinsic corresponds to the <c> PADDD </c> instruction.
411 ///
412 /// \param __m1
413 ///    A 64-bit integer vector of [2 x i32].
414 /// \param __m2
415 ///    A 64-bit integer vector of [2 x i32].
416 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
417 ///    parameters.
418 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi32(__m64 __m1,__m64 __m2)419 _mm_add_pi32(__m64 __m1, __m64 __m2)
420 {
421     return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
422 }
423 
424 /// Adds, with saturation, each 8-bit signed integer element of the first
425 ///    64-bit integer vector of [8 x i8] to the corresponding 8-bit signed
426 ///    integer element of the second 64-bit integer vector of [8 x i8].
427 ///
428 ///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
429 ///    less than 0x80 are saturated to 0x80. The results are packed into a
430 ///    64-bit integer vector of [8 x i8].
431 ///
432 /// \headerfile <x86intrin.h>
433 ///
434 /// This intrinsic corresponds to the <c> PADDSB </c> instruction.
435 ///
436 /// \param __m1
437 ///    A 64-bit integer vector of [8 x i8].
438 /// \param __m2
439 ///    A 64-bit integer vector of [8 x i8].
440 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
441 ///    of both parameters.
442 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pi8(__m64 __m1,__m64 __m2)443 _mm_adds_pi8(__m64 __m1, __m64 __m2)
444 {
445     return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
446 }
447 
448 /// Adds, with saturation, each 16-bit signed integer element of the first
449 ///    64-bit integer vector of [4 x i16] to the corresponding 16-bit signed
450 ///    integer element of the second 64-bit integer vector of [4 x i16].
451 ///
452 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
453 ///    less than 0x8000 are saturated to 0x8000. The results are packed into a
454 ///    64-bit integer vector of [4 x i16].
455 ///
456 /// \headerfile <x86intrin.h>
457 ///
458 /// This intrinsic corresponds to the <c> PADDSW </c> instruction.
459 ///
460 /// \param __m1
461 ///    A 64-bit integer vector of [4 x i16].
462 /// \param __m2
463 ///    A 64-bit integer vector of [4 x i16].
464 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
465 ///    of both parameters.
466 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pi16(__m64 __m1,__m64 __m2)467 _mm_adds_pi16(__m64 __m1, __m64 __m2)
468 {
469     return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
470 }
471 
472 /// Adds, with saturation, each 8-bit unsigned integer element of the first
473 ///    64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned
474 ///    integer element of the second 64-bit integer vector of [8 x i8].
475 ///
476 ///    Sums greater than 0xFF are saturated to 0xFF. The results are packed
477 ///    into a 64-bit integer vector of [8 x i8].
478 ///
479 /// \headerfile <x86intrin.h>
480 ///
481 /// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
482 ///
483 /// \param __m1
484 ///    A 64-bit integer vector of [8 x i8].
485 /// \param __m2
486 ///    A 64-bit integer vector of [8 x i8].
487 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
488 ///    unsigned sums of both parameters.
489 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pu8(__m64 __m1,__m64 __m2)490 _mm_adds_pu8(__m64 __m1, __m64 __m2)
491 {
492     return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
493 }
494 
495 /// Adds, with saturation, each 16-bit unsigned integer element of the first
496 ///    64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned
497 ///    integer element of the second 64-bit integer vector of [4 x i16].
498 ///
499 ///    Sums greater than 0xFFFF are saturated to 0xFFFF. The results are packed
500 ///    into a 64-bit integer vector of [4 x i16].
501 ///
502 /// \headerfile <x86intrin.h>
503 ///
504 /// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
505 ///
506 /// \param __m1
507 ///    A 64-bit integer vector of [4 x i16].
508 /// \param __m2
509 ///    A 64-bit integer vector of [4 x i16].
510 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
511 ///    unsigned sums of both parameters.
512 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pu16(__m64 __m1,__m64 __m2)513 _mm_adds_pu16(__m64 __m1, __m64 __m2)
514 {
515     return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
516 }
517 
518 /// Subtracts each 8-bit integer element of the second 64-bit integer
519 ///    vector of [8 x i8] from the corresponding 8-bit integer element of the
520 ///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
521 ///    are packed into a 64-bit integer vector of [8 x i8].
522 ///
523 /// \headerfile <x86intrin.h>
524 ///
525 /// This intrinsic corresponds to the <c> PSUBB </c> instruction.
526 ///
527 /// \param __m1
528 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
529 /// \param __m2
530 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
531 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
532 ///    both parameters.
533 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi8(__m64 __m1,__m64 __m2)534 _mm_sub_pi8(__m64 __m1, __m64 __m2)
535 {
536     return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
537 }
538 
539 /// Subtracts each 16-bit integer element of the second 64-bit integer
540 ///    vector of [4 x i16] from the corresponding 16-bit integer element of the
541 ///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
542 ///    results are packed into a 64-bit integer vector of [4 x i16].
543 ///
544 /// \headerfile <x86intrin.h>
545 ///
546 /// This intrinsic corresponds to the <c> PSUBW </c> instruction.
547 ///
548 /// \param __m1
549 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
550 /// \param __m2
551 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
552 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
553 ///    both parameters.
554 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi16(__m64 __m1,__m64 __m2)555 _mm_sub_pi16(__m64 __m1, __m64 __m2)
556 {
557     return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
558 }
559 
560 /// Subtracts each 32-bit integer element of the second 64-bit integer
561 ///    vector of [2 x i32] from the corresponding 32-bit integer element of the
562 ///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
563 ///    results are packed into a 64-bit integer vector of [2 x i32].
564 ///
565 /// \headerfile <x86intrin.h>
566 ///
567 /// This intrinsic corresponds to the <c> PSUBD </c> instruction.
568 ///
569 /// \param __m1
570 ///    A 64-bit integer vector of [2 x i32] containing the minuends.
571 /// \param __m2
572 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
573 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
574 ///    both parameters.
575 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi32(__m64 __m1,__m64 __m2)576 _mm_sub_pi32(__m64 __m1, __m64 __m2)
577 {
578     return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
579 }
580 
581 /// Subtracts, with saturation, each 8-bit signed integer element of the second
582 ///    64-bit integer vector of [8 x i8] from the corresponding 8-bit signed
583 ///    integer element of the first 64-bit integer vector of [8 x i8].
584 ///
585 ///    Positive results greater than 0x7F are saturated to 0x7F. Negative
586 ///    results less than 0x80 are saturated to 0x80. The results are packed
587 ///    into a 64-bit integer vector of [8 x i8].
588 ///
589 /// \headerfile <x86intrin.h>
590 ///
591 /// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
592 ///
593 /// \param __m1
594 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
595 /// \param __m2
596 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
597 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
598 ///    differences of both parameters.
599 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pi8(__m64 __m1,__m64 __m2)600 _mm_subs_pi8(__m64 __m1, __m64 __m2)
601 {
602     return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
603 }
604 
605 /// Subtracts, with saturation, each 16-bit signed integer element of the
606 ///    second 64-bit integer vector of [4 x i16] from the corresponding 16-bit
607 ///    signed integer element of the first 64-bit integer vector of [4 x i16].
608 ///
609 ///    Positive results greater than 0x7FFF are saturated to 0x7FFF. Negative
610 ///    results less than 0x8000 are saturated to 0x8000. The results are packed
611 ///    into a 64-bit integer vector of [4 x i16].
612 ///
613 /// \headerfile <x86intrin.h>
614 ///
615 /// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
616 ///
617 /// \param __m1
618 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
619 /// \param __m2
620 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
621 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
622 ///    differences of both parameters.
623 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pi16(__m64 __m1,__m64 __m2)624 _mm_subs_pi16(__m64 __m1, __m64 __m2)
625 {
626     return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
627 }
628 
629 /// Subtracts each 8-bit unsigned integer element of the second 64-bit
630 ///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
631 ///    element of the first 64-bit integer vector of [8 x i8].
632 ///
633 ///    If an element of the first vector is less than the corresponding element
634 ///    of the second vector, the result is saturated to 0. The results are
635 ///    packed into a 64-bit integer vector of [8 x i8].
636 ///
637 /// \headerfile <x86intrin.h>
638 ///
639 /// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
640 ///
641 /// \param __m1
642 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
643 /// \param __m2
644 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
645 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
646 ///    differences of both parameters.
647 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pu8(__m64 __m1,__m64 __m2)648 _mm_subs_pu8(__m64 __m1, __m64 __m2)
649 {
650     return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
651 }
652 
653 /// Subtracts each 16-bit unsigned integer element of the second 64-bit
654 ///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
655 ///    integer element of the first 64-bit integer vector of [4 x i16].
656 ///
657 ///    If an element of the first vector is less than the corresponding element
658 ///    of the second vector, the result is saturated to 0. The results are
659 ///    packed into a 64-bit integer vector of [4 x i16].
660 ///
661 /// \headerfile <x86intrin.h>
662 ///
663 /// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
664 ///
665 /// \param __m1
666 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
667 /// \param __m2
668 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
669 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
670 ///    differences of both parameters.
671 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pu16(__m64 __m1,__m64 __m2)672 _mm_subs_pu16(__m64 __m1, __m64 __m2)
673 {
674     return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
675 }
676 
677 /// Multiplies each 16-bit signed integer element of the first 64-bit
678 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
679 ///    element of the second 64-bit integer vector of [4 x i16] and get four
680 ///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
681 ///    The lower 32 bits of these two sums are packed into a 64-bit integer
682 ///    vector of [2 x i32].
683 ///
684 ///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
685 ///    of both parameters are multiplied, and the sum of both results is written
686 ///    to bits [31:0] of the result.
687 ///
688 /// \headerfile <x86intrin.h>
689 ///
690 /// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
691 ///
692 /// \param __m1
693 ///    A 64-bit integer vector of [4 x i16].
694 /// \param __m2
695 ///    A 64-bit integer vector of [4 x i16].
696 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of
697 ///    products of both parameters.
698 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_madd_pi16(__m64 __m1,__m64 __m2)699 _mm_madd_pi16(__m64 __m1, __m64 __m2)
700 {
701     return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
702                                                (__v8hi)__anyext128(__m2)));
703 }
704 
705 /// Multiplies each 16-bit signed integer element of the first 64-bit
706 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
707 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
708 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
709 ///
710 /// \headerfile <x86intrin.h>
711 ///
712 /// This intrinsic corresponds to the <c> PMULHW </c> instruction.
713 ///
714 /// \param __m1
715 ///    A 64-bit integer vector of [4 x i16].
716 /// \param __m2
717 ///    A 64-bit integer vector of [4 x i16].
718 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
719 ///    of the products of both parameters.
720 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pi16(__m64 __m1,__m64 __m2)721 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
722 {
723     return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
724                                               (__v8hi)__anyext128(__m2)));
725 }
726 
727 /// Multiplies each 16-bit signed integer element of the first 64-bit
728 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
729 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
730 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
731 ///
732 /// \headerfile <x86intrin.h>
733 ///
734 /// This intrinsic corresponds to the <c> PMULLW </c> instruction.
735 ///
736 /// \param __m1
737 ///    A 64-bit integer vector of [4 x i16].
738 /// \param __m2
739 ///    A 64-bit integer vector of [4 x i16].
740 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
741 ///    of the products of both parameters.
742 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mullo_pi16(__m64 __m1,__m64 __m2)743 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
744 {
745     return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
746 }
747 
748 /// Left-shifts each 16-bit signed integer element of the first
749 ///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
750 ///    of bits specified by the second parameter, which is a 64-bit integer. The
751 ///    lower 16 bits of the results are packed into a 64-bit integer vector of
752 ///    [4 x i16].
753 ///
754 /// \headerfile <x86intrin.h>
755 ///
756 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
757 ///
758 /// \param __m
759 ///    A 64-bit integer vector of [4 x i16].
760 /// \param __count
761 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
762 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
///    values. If \a __count is greater than or equal to 16, the result is set
///    to all 0.
765 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi16(__m64 __m,__m64 __count)766 _mm_sll_pi16(__m64 __m, __m64 __count)
767 {
768     return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
769                                              (__v8hi)__anyext128(__count)));
770 }
771 
772 /// Left-shifts each 16-bit signed integer element of a 64-bit integer
773 ///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
774 ///    The lower 16 bits of the results are packed into a 64-bit integer vector
775 ///    of [4 x i16].
776 ///
777 /// \headerfile <x86intrin.h>
778 ///
779 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
780 ///
781 /// \param __m
782 ///    A 64-bit integer vector of [4 x i16].
783 /// \param __count
784 ///    A 32-bit integer value.
785 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
///    values. If \a __count is greater than or equal to 16, the result is set
///    to all 0.
788 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi16(__m64 __m,int __count)789 _mm_slli_pi16(__m64 __m, int __count)
790 {
791     return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
792                                               __count));
793 }
794 
795 /// Left-shifts each 32-bit signed integer element of the first
796 ///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
797 ///    of bits specified by the second parameter, which is a 64-bit integer. The
798 ///    lower 32 bits of the results are packed into a 64-bit integer vector of
799 ///    [2 x i32].
800 ///
801 /// \headerfile <x86intrin.h>
802 ///
803 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
804 ///
805 /// \param __m
806 ///    A 64-bit integer vector of [2 x i32].
807 /// \param __count
808 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
809 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
///    values. If \a __count is greater than or equal to 32, the result is set
///    to all 0.
812 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi32(__m64 __m,__m64 __count)813 _mm_sll_pi32(__m64 __m, __m64 __count)
814 {
815     return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
816                                              (__v4si)__anyext128(__count)));
817 }
818 
819 /// Left-shifts each 32-bit signed integer element of a 64-bit integer
820 ///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
821 ///    The lower 32 bits of the results are packed into a 64-bit integer vector
822 ///    of [2 x i32].
823 ///
824 /// \headerfile <x86intrin.h>
825 ///
826 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
827 ///
828 /// \param __m
829 ///    A 64-bit integer vector of [2 x i32].
830 /// \param __count
831 ///    A 32-bit integer value.
832 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
///    values. If \a __count is greater than or equal to 32, the result is set
///    to all 0.
835 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi32(__m64 __m,int __count)836 _mm_slli_pi32(__m64 __m, int __count)
837 {
838     return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
839                                               __count));
840 }
841 
842 /// Left-shifts the first 64-bit integer parameter by the number of bits
843 ///    specified by the second 64-bit integer parameter. The lower 64 bits of
844 ///    result are returned.
845 ///
846 /// \headerfile <x86intrin.h>
847 ///
848 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
849 ///
850 /// \param __m
851 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
852 /// \param __count
853 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
854 /// \returns A 64-bit integer vector containing the left-shifted value. If
///     \a __count is greater than or equal to 64, the result is set to 0.
856 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_si64(__m64 __m,__m64 __count)857 _mm_sll_si64(__m64 __m, __m64 __count)
858 {
859     return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
860                                              (__v2di)__anyext128(__count)));
861 }
862 
863 /// Left-shifts the first parameter, which is a 64-bit integer, by the
864 ///    number of bits specified by the second parameter, which is a 32-bit
865 ///    integer. The lower 64 bits of result are returned.
866 ///
867 /// \headerfile <x86intrin.h>
868 ///
869 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
870 ///
871 /// \param __m
872 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
873 /// \param __count
874 ///    A 32-bit integer value.
875 /// \returns A 64-bit integer vector containing the left-shifted value. If
///     \a __count is greater than or equal to 64, the result is set to 0.
877 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_si64(__m64 __m,int __count)878 _mm_slli_si64(__m64 __m, int __count)
879 {
880     return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
881                                               __count));
882 }
883 
884 /// Right-shifts each 16-bit integer element of the first parameter,
885 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
886 ///    specified by the second parameter, which is a 64-bit integer.
887 ///
888 ///    High-order bits are filled with the sign bit of the initial value of each
889 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
890 ///    vector of [4 x i16].
891 ///
892 /// \headerfile <x86intrin.h>
893 ///
894 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
895 ///
896 /// \param __m
897 ///    A 64-bit integer vector of [4 x i16].
898 /// \param __count
899 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
900 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
901 ///    values.
902 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi16(__m64 __m,__m64 __count)903 _mm_sra_pi16(__m64 __m, __m64 __count)
904 {
905     return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
906                                              (__v8hi)__anyext128(__count)));
907 }
908 
909 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
910 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
911 ///
912 ///    High-order bits are filled with the sign bit of the initial value of each
913 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
914 ///    vector of [4 x i16].
915 ///
916 /// \headerfile <x86intrin.h>
917 ///
918 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
919 ///
920 /// \param __m
921 ///    A 64-bit integer vector of [4 x i16].
922 /// \param __count
923 ///    A 32-bit integer value.
924 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
925 ///    values.
926 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi16(__m64 __m,int __count)927 _mm_srai_pi16(__m64 __m, int __count)
928 {
929     return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
930                                               __count));
931 }
932 
933 /// Right-shifts each 32-bit integer element of the first parameter,
934 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
935 ///    specified by the second parameter, which is a 64-bit integer.
936 ///
937 ///    High-order bits are filled with the sign bit of the initial value of each
938 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
939 ///    vector of [2 x i32].
940 ///
941 /// \headerfile <x86intrin.h>
942 ///
943 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
944 ///
945 /// \param __m
946 ///    A 64-bit integer vector of [2 x i32].
947 /// \param __count
948 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
949 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
950 ///    values.
951 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi32(__m64 __m,__m64 __count)952 _mm_sra_pi32(__m64 __m, __m64 __count)
953 {
954     return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
955                                              (__v4si)__anyext128(__count)));
956 }
957 
958 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
959 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
960 ///
961 ///    High-order bits are filled with the sign bit of the initial value of each
962 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
963 ///    vector of [2 x i32].
964 ///
965 /// \headerfile <x86intrin.h>
966 ///
967 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
968 ///
969 /// \param __m
970 ///    A 64-bit integer vector of [2 x i32].
971 /// \param __count
972 ///    A 32-bit integer value.
973 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
974 ///    values.
975 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi32(__m64 __m,int __count)976 _mm_srai_pi32(__m64 __m, int __count)
977 {
978     return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
979                                               __count));
980 }
981 
982 /// Right-shifts each 16-bit integer element of the first parameter,
983 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
984 ///    specified by the second parameter, which is a 64-bit integer.
985 ///
986 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
987 ///    integer vector of [4 x i16].
988 ///
989 /// \headerfile <x86intrin.h>
990 ///
991 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
992 ///
993 /// \param __m
994 ///    A 64-bit integer vector of [4 x i16].
995 /// \param __count
996 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
997 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
998 ///    values.
999 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi16(__m64 __m,__m64 __count)1000 _mm_srl_pi16(__m64 __m, __m64 __count)
1001 {
1002     return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
1003                                              (__v8hi)__anyext128(__count)));
1004 }
1005 
1006 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
1007 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
1008 ///
1009 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
1010 ///    integer vector of [4 x i16].
1011 ///
1012 /// \headerfile <x86intrin.h>
1013 ///
1014 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
1015 ///
1016 /// \param __m
1017 ///    A 64-bit integer vector of [4 x i16].
1018 /// \param __count
1019 ///    A 32-bit integer value.
1020 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
1021 ///    values.
1022 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi16(__m64 __m,int __count)1023 _mm_srli_pi16(__m64 __m, int __count)
1024 {
1025     return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
1026                                               __count));
1027 }
1028 
1029 /// Right-shifts each 32-bit integer element of the first parameter,
1030 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
1031 ///    specified by the second parameter, which is a 64-bit integer.
1032 ///
1033 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
1034 ///    integer vector of [2 x i32].
1035 ///
1036 /// \headerfile <x86intrin.h>
1037 ///
1038 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1039 ///
1040 /// \param __m
1041 ///    A 64-bit integer vector of [2 x i32].
1042 /// \param __count
1043 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1044 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1045 ///    values.
1046 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi32(__m64 __m,__m64 __count)1047 _mm_srl_pi32(__m64 __m, __m64 __count)
1048 {
1049     return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
1050                                              (__v4si)__anyext128(__count)));
1051 }
1052 
1053 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
1054 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
1055 ///
1056 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
1057 ///    integer vector of [2 x i32].
1058 ///
1059 /// \headerfile <x86intrin.h>
1060 ///
1061 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1062 ///
1063 /// \param __m
1064 ///    A 64-bit integer vector of [2 x i32].
1065 /// \param __count
1066 ///    A 32-bit integer value.
1067 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1068 ///    values.
1069 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi32(__m64 __m,int __count)1070 _mm_srli_pi32(__m64 __m, int __count)
1071 {
1072     return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
1073                                               __count));
1074 }
1075 
1076 /// Right-shifts the first 64-bit integer parameter by the number of bits
1077 ///    specified by the second 64-bit integer parameter.
1078 ///
1079 ///    High-order bits are cleared.
1080 ///
1081 /// \headerfile <x86intrin.h>
1082 ///
1083 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1084 ///
1085 /// \param __m
1086 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1087 /// \param __count
1088 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1089 /// \returns A 64-bit integer vector containing the right-shifted value.
1090 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_si64(__m64 __m,__m64 __count)1091 _mm_srl_si64(__m64 __m, __m64 __count)
1092 {
1093     return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
1094                                              (__v2di)__anyext128(__count)));
1095 }
1096 
1097 /// Right-shifts the first parameter, which is a 64-bit integer, by the
1098 ///    number of bits specified by the second parameter, which is a 32-bit
1099 ///    integer.
1100 ///
1101 ///    High-order bits are cleared.
1102 ///
1103 /// \headerfile <x86intrin.h>
1104 ///
1105 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1106 ///
1107 /// \param __m
1108 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1109 /// \param __count
1110 ///    A 32-bit integer value.
1111 /// \returns A 64-bit integer vector containing the right-shifted value.
1112 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_si64(__m64 __m,int __count)1113 _mm_srli_si64(__m64 __m, int __count)
1114 {
1115     return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
1116                                               __count));
1117 }
1118 
1119 /// Performs a bitwise AND of two 64-bit integer vectors.
1120 ///
1121 /// \headerfile <x86intrin.h>
1122 ///
1123 /// This intrinsic corresponds to the <c> PAND </c> instruction.
1124 ///
1125 /// \param __m1
1126 ///    A 64-bit integer vector.
1127 /// \param __m2
1128 ///    A 64-bit integer vector.
1129 /// \returns A 64-bit integer vector containing the bitwise AND of both
1130 ///    parameters.
1131 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_and_si64(__m64 __m1,__m64 __m2)1132 _mm_and_si64(__m64 __m1, __m64 __m2)
1133 {
1134     return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
1135 }
1136 
1137 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
1138 ///    performs a bitwise AND of the intermediate result and the second 64-bit
1139 ///    integer vector.
1140 ///
1141 /// \headerfile <x86intrin.h>
1142 ///
1143 /// This intrinsic corresponds to the <c> PANDN </c> instruction.
1144 ///
1145 /// \param __m1
1146 ///    A 64-bit integer vector. The one's complement of this parameter is used
1147 ///    in the bitwise AND.
1148 /// \param __m2
1149 ///    A 64-bit integer vector.
1150 /// \returns A 64-bit integer vector containing the bitwise AND of the second
1151 ///    parameter and the one's complement of the first parameter.
1152 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_andnot_si64(__m64 __m1,__m64 __m2)1153 _mm_andnot_si64(__m64 __m1, __m64 __m2)
1154 {
1155     return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
1156 }
1157 
1158 /// Performs a bitwise OR of two 64-bit integer vectors.
1159 ///
1160 /// \headerfile <x86intrin.h>
1161 ///
1162 /// This intrinsic corresponds to the <c> POR </c> instruction.
1163 ///
1164 /// \param __m1
1165 ///    A 64-bit integer vector.
1166 /// \param __m2
1167 ///    A 64-bit integer vector.
1168 /// \returns A 64-bit integer vector containing the bitwise OR of both
1169 ///    parameters.
1170 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_or_si64(__m64 __m1,__m64 __m2)1171 _mm_or_si64(__m64 __m1, __m64 __m2)
1172 {
1173     return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
1174 }
1175 
1176 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
1177 ///
1178 /// \headerfile <x86intrin.h>
1179 ///
1180 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
1181 ///
1182 /// \param __m1
1183 ///    A 64-bit integer vector.
1184 /// \param __m2
1185 ///    A 64-bit integer vector.
1186 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
1187 ///    parameters.
1188 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_xor_si64(__m64 __m1,__m64 __m2)1189 _mm_xor_si64(__m64 __m1, __m64 __m2)
1190 {
1191     return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
1192 }
1193 
1194 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
1195 ///    [8 x i8] to determine if the element of the first vector is equal to the
1196 ///    corresponding element of the second vector.
1197 ///
1198 ///    Each comparison returns 0 for false, 0xFF for true.
1199 ///
1200 /// \headerfile <x86intrin.h>
1201 ///
1202 /// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
1203 ///
1204 /// \param __m1
1205 ///    A 64-bit integer vector of [8 x i8].
1206 /// \param __m2
1207 ///    A 64-bit integer vector of [8 x i8].
1208 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1209 ///    results.
1210 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi8(__m64 __m1,__m64 __m2)1211 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
1212 {
1213     return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
1214 }
1215 
1216 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
1217 ///    [4 x i16] to determine if the element of the first vector is equal to the
1218 ///    corresponding element of the second vector.
1219 ///
1220 ///    Each comparison returns 0 for false, 0xFFFF for true.
1221 ///
1222 /// \headerfile <x86intrin.h>
1223 ///
1224 /// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
1225 ///
1226 /// \param __m1
1227 ///    A 64-bit integer vector of [4 x i16].
1228 /// \param __m2
1229 ///    A 64-bit integer vector of [4 x i16].
1230 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1231 ///    results.
1232 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi16(__m64 __m1,__m64 __m2)1233 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
1234 {
1235     return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
1236 }
1237 
1238 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
1239 ///    [2 x i32] to determine if the element of the first vector is equal to the
1240 ///    corresponding element of the second vector.
1241 ///
1242 ///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
1243 ///
1244 /// \headerfile <x86intrin.h>
1245 ///
1246 /// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
1247 ///
1248 /// \param __m1
1249 ///    A 64-bit integer vector of [2 x i32].
1250 /// \param __m2
1251 ///    A 64-bit integer vector of [2 x i32].
1252 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1253 ///    results.
1254 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi32(__m64 __m1,__m64 __m2)1255 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
1256 {
1257     return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
1258 }
1259 
1260 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
1261 ///    [8 x i8] to determine if the element of the first vector is greater than
1262 ///    the corresponding element of the second vector.
1263 ///
1264 ///    Each comparison returns 0 for false, 0xFF for true.
1265 ///
1266 /// \headerfile <x86intrin.h>
1267 ///
1268 /// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
1269 ///
1270 /// \param __m1
1271 ///    A 64-bit integer vector of [8 x i8].
1272 /// \param __m2
1273 ///    A 64-bit integer vector of [8 x i8].
1274 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1275 ///    results.
1276 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi8(__m64 __m1,__m64 __m2)1277 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
1278 {
1279   /* This function always performs a signed comparison, but __v8qi is a char
1280      which may be signed or unsigned, so use __v8qs. */
1281     return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
1282 }
1283 
1284 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
1285 ///    [4 x i16] to determine if the element of the first vector is greater than
1286 ///    the corresponding element of the second vector.
1287 ///
1288 ///    Each comparison returns 0 for false, 0xFFFF for true.
1289 ///
1290 /// \headerfile <x86intrin.h>
1291 ///
1292 /// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
1293 ///
1294 /// \param __m1
1295 ///    A 64-bit integer vector of [4 x i16].
1296 /// \param __m2
1297 ///    A 64-bit integer vector of [4 x i16].
1298 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1299 ///    results.
1300 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi16(__m64 __m1,__m64 __m2)1301 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
1302 {
1303     return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
1304 }
1305 
1306 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
1307 ///    [2 x i32] to determine if the element of the first vector is greater than
1308 ///    the corresponding element of the second vector.
1309 ///
1310 ///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
1311 ///
1312 /// \headerfile <x86intrin.h>
1313 ///
1314 /// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
1315 ///
1316 /// \param __m1
1317 ///    A 64-bit integer vector of [2 x i32].
1318 /// \param __m2
1319 ///    A 64-bit integer vector of [2 x i32].
1320 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1321 ///    results.
1322 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi32(__m64 __m1,__m64 __m2)1323 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
1324 {
1325     return (__m64)((__v2si)__m1 > (__v2si)__m2);
1326 }
1327 
1328 /// Constructs a 64-bit integer vector initialized to zero.
1329 ///
1330 /// \headerfile <x86intrin.h>
1331 ///
1332 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
1333 ///
1334 /// \returns An initialized 64-bit integer vector with all elements set to zero.
1335 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_setzero_si64(void)1336 _mm_setzero_si64(void)
1337 {
1338     return __extension__ (__m64){ 0LL };
1339 }
1340 
1341 /// Constructs a 64-bit integer vector initialized with the specified
1342 ///    32-bit integer values.
1343 ///
1344 /// \headerfile <x86intrin.h>
1345 ///
1346 /// This intrinsic is a utility function and does not correspond to a specific
1347 ///    instruction.
1348 ///
1349 /// \param __i1
1350 ///    A 32-bit integer value used to initialize the upper 32 bits of the
1351 ///    result.
1352 /// \param __i0
1353 ///    A 32-bit integer value used to initialize the lower 32 bits of the
1354 ///    result.
1355 /// \returns An initialized 64-bit integer vector.
1356 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set_pi32(int __i1,int __i0)1357 _mm_set_pi32(int __i1, int __i0)
1358 {
1359     return __extension__ (__m64)(__v2si){__i0, __i1};
1360 }
1361 
1362 /// Constructs a 64-bit integer vector initialized with the specified
1363 ///    16-bit integer values.
1364 ///
1365 /// \headerfile <x86intrin.h>
1366 ///
1367 /// This intrinsic is a utility function and does not correspond to a specific
1368 ///    instruction.
1369 ///
1370 /// \param __s3
1371 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
1372 /// \param __s2
1373 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
1374 /// \param __s1
1375 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
1376 /// \param __s0
1377 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
1378 /// \returns An initialized 64-bit integer vector.
1379 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set_pi16(short __s3,short __s2,short __s1,short __s0)1380 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
1381 {
1382     return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3};
1383 }
1384 
1385 /// Constructs a 64-bit integer vector initialized with the specified
1386 ///    8-bit integer values.
1387 ///
1388 /// \headerfile <x86intrin.h>
1389 ///
1390 /// This intrinsic is a utility function and does not correspond to a specific
1391 ///    instruction.
1392 ///
1393 /// \param __b7
1394 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
1395 /// \param __b6
1396 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
1397 /// \param __b5
1398 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
1399 /// \param __b4
1400 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
1401 /// \param __b3
1402 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
1403 /// \param __b2
1404 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
1405 /// \param __b1
1406 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
1407 /// \param __b0
1408 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
1409 /// \returns An initialized 64-bit integer vector.
1410 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set_pi8(char __b7,char __b6,char __b5,char __b4,char __b3,char __b2,char __b1,char __b0)1411 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
1412             char __b1, char __b0)
1413 {
1414     return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3,
1415                                          __b4, __b5, __b6, __b7};
1416 }
1417 
1418 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
1419 ///    32-bit integer vector elements set to the specified 32-bit integer
1420 ///    value.
1421 ///
1422 /// \headerfile <x86intrin.h>
1423 ///
1424 /// This intrinsic is a utility function and does not correspond to a specific
1425 ///    instruction.
1426 ///
1427 /// \param __i
1428 ///    A 32-bit integer value used to initialize each vector element of the
1429 ///    result.
1430 /// \returns An initialized 64-bit integer vector of [2 x i32].
1431 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set1_pi32(int __i)1432 _mm_set1_pi32(int __i)
1433 {
1434     return _mm_set_pi32(__i, __i);
1435 }
1436 
1437 /// Constructs a 64-bit integer vector of [4 x i16], with each of the
1438 ///    16-bit integer vector elements set to the specified 16-bit integer
1439 ///    value.
1440 ///
1441 /// \headerfile <x86intrin.h>
1442 ///
1443 /// This intrinsic is a utility function and does not correspond to a specific
1444 ///    instruction.
1445 ///
1446 /// \param __w
1447 ///    A 16-bit integer value used to initialize each vector element of the
1448 ///    result.
1449 /// \returns An initialized 64-bit integer vector of [4 x i16].
1450 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set1_pi16(short __w)1451 _mm_set1_pi16(short __w)
1452 {
1453     return _mm_set_pi16(__w, __w, __w, __w);
1454 }
1455 
1456 /// Constructs a 64-bit integer vector of [8 x i8], with each of the
1457 ///    8-bit integer vector elements set to the specified 8-bit integer value.
1458 ///
1459 /// \headerfile <x86intrin.h>
1460 ///
1461 /// This intrinsic is a utility function and does not correspond to a specific
1462 ///    instruction.
1463 ///
1464 /// \param __b
1465 ///    An 8-bit integer value used to initialize each vector element of the
1466 ///    result.
1467 /// \returns An initialized 64-bit integer vector of [8 x i8].
1468 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set1_pi8(char __b)1469 _mm_set1_pi8(char __b)
1470 {
1471     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
1472 }
1473 
1474 /// Constructs a 64-bit integer vector, initialized in reverse order with
1475 ///    the specified 32-bit integer values.
1476 ///
1477 /// \headerfile <x86intrin.h>
1478 ///
1479 /// This intrinsic is a utility function and does not correspond to a specific
1480 ///    instruction.
1481 ///
1482 /// \param __i0
1483 ///    A 32-bit integer value used to initialize the lower 32 bits of the
1484 ///    result.
1485 /// \param __i1
1486 ///    A 32-bit integer value used to initialize the upper 32 bits of the
1487 ///    result.
1488 /// \returns An initialized 64-bit integer vector.
1489 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_setr_pi32(int __i0,int __i1)1490 _mm_setr_pi32(int __i0, int __i1)
1491 {
1492     return _mm_set_pi32(__i1, __i0);
1493 }
1494 
1495 /// Constructs a 64-bit integer vector, initialized in reverse order with
1496 ///    the specified 16-bit integer values.
1497 ///
1498 /// \headerfile <x86intrin.h>
1499 ///
1500 /// This intrinsic is a utility function and does not correspond to a specific
1501 ///    instruction.
1502 ///
1503 /// \param __w0
1504 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
1505 /// \param __w1
1506 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
1507 /// \param __w2
1508 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
1509 /// \param __w3
1510 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
1511 /// \returns An initialized 64-bit integer vector.
1512 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_setr_pi16(short __w0,short __w1,short __w2,short __w3)1513 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
1514 {
1515     return _mm_set_pi16(__w3, __w2, __w1, __w0);
1516 }
1517 
1518 /// Constructs a 64-bit integer vector, initialized in reverse order with
1519 ///    the specified 8-bit integer values.
1520 ///
1521 /// \headerfile <x86intrin.h>
1522 ///
1523 /// This intrinsic is a utility function and does not correspond to a specific
1524 ///    instruction.
1525 ///
1526 /// \param __b0
1527 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
1528 /// \param __b1
1529 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
1530 /// \param __b2
1531 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
1532 /// \param __b3
1533 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
1534 /// \param __b4
1535 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
1536 /// \param __b5
1537 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
1538 /// \param __b6
1539 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
1540 /// \param __b7
1541 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
1542 /// \returns An initialized 64-bit integer vector.
1543 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_setr_pi8(char __b0,char __b1,char __b2,char __b3,char __b4,char __b5,char __b6,char __b7)1544 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
1545              char __b6, char __b7)
1546 {
1547     return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1548 }
1549 
/* Remove this header's private helper macros so they do not leak into code
 * that includes it. */
#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS_SSE2

/*
 * Compatibility aliases: each _m_* name below expands to the _mm_* intrinsic
 * on its right.
 */

/* State management and scalar conversions. */
#define _m_empty      _mm_empty
#define _m_from_int   _mm_cvtsi32_si64
#define _m_from_int64 _mm_cvtsi64_m64
#define _m_to_int     _mm_cvtsi64_si32
#define _m_to_int64   _mm_cvtm64_si64

/* Pack and unpack. */
#define _m_packsswb   _mm_packs_pi16
#define _m_packssdw   _mm_packs_pi32
#define _m_packuswb   _mm_packs_pu16
#define _m_punpckhbw  _mm_unpackhi_pi8
#define _m_punpckhwd  _mm_unpackhi_pi16
#define _m_punpckhdq  _mm_unpackhi_pi32
#define _m_punpcklbw  _mm_unpacklo_pi8
#define _m_punpcklwd  _mm_unpacklo_pi16
#define _m_punpckldq  _mm_unpacklo_pi32

/* Addition. */
#define _m_paddb      _mm_add_pi8
#define _m_paddw      _mm_add_pi16
#define _m_paddd      _mm_add_pi32
#define _m_paddsb     _mm_adds_pi8
#define _m_paddsw     _mm_adds_pi16
#define _m_paddusb    _mm_adds_pu8
#define _m_paddusw    _mm_adds_pu16

/* Subtraction. */
#define _m_psubb      _mm_sub_pi8
#define _m_psubw      _mm_sub_pi16
#define _m_psubd      _mm_sub_pi32
#define _m_psubsb     _mm_subs_pi8
#define _m_psubsw     _mm_subs_pi16
#define _m_psubusb    _mm_subs_pu8
#define _m_psubusw    _mm_subs_pu16

/* Multiplication. */
#define _m_pmaddwd    _mm_madd_pi16
#define _m_pmulhw     _mm_mulhi_pi16
#define _m_pmullw     _mm_mullo_pi16

/* Shifts. */
#define _m_psllw      _mm_sll_pi16
#define _m_psllwi     _mm_slli_pi16
#define _m_pslld      _mm_sll_pi32
#define _m_pslldi     _mm_slli_pi32
#define _m_psllq      _mm_sll_si64
#define _m_psllqi     _mm_slli_si64
#define _m_psraw      _mm_sra_pi16
#define _m_psrawi     _mm_srai_pi16
#define _m_psrad      _mm_sra_pi32
#define _m_psradi     _mm_srai_pi32
#define _m_psrlw      _mm_srl_pi16
#define _m_psrlwi     _mm_srli_pi16
#define _m_psrld      _mm_srl_pi32
#define _m_psrldi     _mm_srli_pi32
#define _m_psrlq      _mm_srl_si64
#define _m_psrlqi     _mm_srli_si64

/* Bitwise logic. */
#define _m_pand       _mm_and_si64
#define _m_pandn      _mm_andnot_si64
#define _m_por        _mm_or_si64
#define _m_pxor       _mm_xor_si64

/* Comparisons. */
#define _m_pcmpeqb    _mm_cmpeq_pi8
#define _m_pcmpeqw    _mm_cmpeq_pi16
#define _m_pcmpeqd    _mm_cmpeq_pi32
#define _m_pcmpgtb    _mm_cmpgt_pi8
#define _m_pcmpgtw    _mm_cmpgt_pi16
#define _m_pcmpgtd    _mm_cmpgt_pi32
1612 
1613 #endif /* __MMINTRIN_H */
1614 
1615