1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10 #ifndef __TMMINTRIN_H
11 #define __TMMINTRIN_H
12
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16
17 #include <pmmintrin.h>
18
/* Default attributes for every intrinsic defined in this header: always
 * inlined, no debug info, SSSE3 target and a 128-bit minimum vector width.
 * The "no-evex512" target feature is added only when __EVEX512__ is defined
 * and __AVX10_1_512__ is not. */
#if !defined(__EVEX512__) || defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), \
                 __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("ssse3,no-evex512"), __min_vector_width__(128)))
#endif
29
/* Private helpers, #undef'd at the end of this header.
 *
 * __trunc64(x):   reinterprets x as [2 x i64] and returns element 0 as an
 *                 __m64.
 * __anyext128(x): reinterprets x as [2 x i32] and widens it to an __m128i;
 *                 result elements 0-1 come from x, elements 2-3 use shuffle
 *                 index -1 and therefore hold unspecified values.
 *
 * Each expansion is wrapped in parentheses so that it always behaves as a
 * single operand, whatever expression surrounds the macro invocation. */
#define __trunc64(x) \
  ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))
#define __anyext128(x) \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, -1, -1))
35
36 /// Computes the absolute value of each of the packed 8-bit signed
37 /// integers in the source operand and stores the 8-bit unsigned integer
38 /// results in the destination.
39 ///
40 /// \headerfile <x86intrin.h>
41 ///
42 /// This intrinsic corresponds to the \c PABSB instruction.
43 ///
44 /// \param __a
45 /// A 64-bit vector of [8 x i8].
46 /// \returns A 64-bit integer vector containing the absolute values of the
47 /// elements in the operand.
48 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi8(__m64 __a)49 _mm_abs_pi8(__m64 __a)
50 {
51 return (__m64)__builtin_elementwise_abs((__v8qs)__a);
52 }
53
54 /// Computes the absolute value of each of the packed 8-bit signed
55 /// integers in the source operand and stores the 8-bit unsigned integer
56 /// results in the destination.
57 ///
58 /// \headerfile <x86intrin.h>
59 ///
60 /// This intrinsic corresponds to the \c VPABSB instruction.
61 ///
62 /// \param __a
63 /// A 128-bit vector of [16 x i8].
64 /// \returns A 128-bit integer vector containing the absolute values of the
65 /// elements in the operand.
66 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi8(__m128i __a)67 _mm_abs_epi8(__m128i __a)
68 {
69 return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
70 }
71
72 /// Computes the absolute value of each of the packed 16-bit signed
73 /// integers in the source operand and stores the 16-bit unsigned integer
74 /// results in the destination.
75 ///
76 /// \headerfile <x86intrin.h>
77 ///
78 /// This intrinsic corresponds to the \c PABSW instruction.
79 ///
80 /// \param __a
81 /// A 64-bit vector of [4 x i16].
82 /// \returns A 64-bit integer vector containing the absolute values of the
83 /// elements in the operand.
84 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi16(__m64 __a)85 _mm_abs_pi16(__m64 __a)
86 {
87 return (__m64)__builtin_elementwise_abs((__v4hi)__a);
88 }
89
90 /// Computes the absolute value of each of the packed 16-bit signed
91 /// integers in the source operand and stores the 16-bit unsigned integer
92 /// results in the destination.
93 ///
94 /// \headerfile <x86intrin.h>
95 ///
96 /// This intrinsic corresponds to the \c VPABSW instruction.
97 ///
98 /// \param __a
99 /// A 128-bit vector of [8 x i16].
100 /// \returns A 128-bit integer vector containing the absolute values of the
101 /// elements in the operand.
102 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi16(__m128i __a)103 _mm_abs_epi16(__m128i __a)
104 {
105 return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
106 }
107
108 /// Computes the absolute value of each of the packed 32-bit signed
109 /// integers in the source operand and stores the 32-bit unsigned integer
110 /// results in the destination.
111 ///
112 /// \headerfile <x86intrin.h>
113 ///
114 /// This intrinsic corresponds to the \c PABSD instruction.
115 ///
116 /// \param __a
117 /// A 64-bit vector of [2 x i32].
118 /// \returns A 64-bit integer vector containing the absolute values of the
119 /// elements in the operand.
120 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi32(__m64 __a)121 _mm_abs_pi32(__m64 __a)
122 {
123 return (__m64)__builtin_elementwise_abs((__v2si)__a);
124 }
125
126 /// Computes the absolute value of each of the packed 32-bit signed
127 /// integers in the source operand and stores the 32-bit unsigned integer
128 /// results in the destination.
129 ///
130 /// \headerfile <x86intrin.h>
131 ///
132 /// This intrinsic corresponds to the \c VPABSD instruction.
133 ///
134 /// \param __a
135 /// A 128-bit vector of [4 x i32].
136 /// \returns A 128-bit integer vector containing the absolute values of the
137 /// elements in the operand.
138 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi32(__m128i __a)139 _mm_abs_epi32(__m128i __a)
140 {
141 return (__m128i)__builtin_elementwise_abs((__v4si)__a);
142 }
143
144 /// Concatenates the two 128-bit integer vector operands, and
145 /// right-shifts the result by the number of bytes specified in the immediate
146 /// operand.
147 ///
148 /// \headerfile <x86intrin.h>
149 ///
150 /// \code
151 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
152 /// \endcode
153 ///
154 /// This intrinsic corresponds to the \c PALIGNR instruction.
155 ///
156 /// \param a
157 /// A 128-bit vector of [16 x i8] containing one of the source operands.
158 /// \param b
159 /// A 128-bit vector of [16 x i8] containing one of the source operands.
160 /// \param n
161 /// An immediate operand specifying how many bytes to right-shift the result.
162 /// \returns A 128-bit integer vector containing the concatenated right-shifted
163 /// value.
/* Forwards to the 128-bit PALIGNR builtin after casting both vector operands
 * to [16 x i8]; n is passed through as the immediate byte count. */
#define _mm_alignr_epi8(a, b, n) \
  ((__m128i)__builtin_ia32_palignr128( \
      (__v16qi)(__m128i)(a), (__v16qi)(__m128i)(b), (n)))
167
168 /// Concatenates the two 64-bit integer vector operands, and right-shifts
169 /// the result by the number of bytes specified in the immediate operand.
170 ///
171 /// \headerfile <x86intrin.h>
172 ///
173 /// \code
174 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
175 /// \endcode
176 ///
177 /// This intrinsic corresponds to the \c PALIGNR instruction.
178 ///
179 /// \param a
180 /// A 64-bit vector of [8 x i8] containing one of the source operands.
181 /// \param b
182 /// A 64-bit vector of [8 x i8] containing one of the source operands.
183 /// \param n
184 /// An immediate operand specifying how many bytes to right-shift the result.
185 /// \returns A 64-bit integer vector containing the concatenated right-shifted
186 /// value.
/* Expansion, innermost first: build a 128-bit vector with b in the low
 * 64 bits and a in the high 64 bits, shift it right by n bytes, then keep
 * only the low 64 bits of the shifted value. */
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_shufflevector( \
      __builtin_ia32_psrldqi128_byteshift( \
          __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), (n)), \
      __extension__ (__v2di){}, 0))
192
193 /// Horizontally adds the adjacent pairs of values contained in 2 packed
194 /// 128-bit vectors of [8 x i16].
195 ///
196 /// \headerfile <x86intrin.h>
197 ///
198 /// This intrinsic corresponds to the \c VPHADDW instruction.
199 ///
200 /// \param __a
201 /// A 128-bit vector of [8 x i16] containing one of the source operands. The
202 /// horizontal sums of the values are stored in the lower bits of the
203 /// destination.
204 /// \param __b
205 /// A 128-bit vector of [8 x i16] containing one of the source operands. The
206 /// horizontal sums of the values are stored in the upper bits of the
207 /// destination.
208 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
209 /// both operands.
210 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi16(__m128i __a,__m128i __b)211 _mm_hadd_epi16(__m128i __a, __m128i __b)
212 {
213 return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
214 }
215
216 /// Horizontally adds the adjacent pairs of values contained in 2 packed
217 /// 128-bit vectors of [4 x i32].
218 ///
219 /// \headerfile <x86intrin.h>
220 ///
221 /// This intrinsic corresponds to the \c VPHADDD instruction.
222 ///
223 /// \param __a
224 /// A 128-bit vector of [4 x i32] containing one of the source operands. The
225 /// horizontal sums of the values are stored in the lower bits of the
226 /// destination.
227 /// \param __b
228 /// A 128-bit vector of [4 x i32] containing one of the source operands. The
229 /// horizontal sums of the values are stored in the upper bits of the
230 /// destination.
231 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
232 /// both operands.
233 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi32(__m128i __a,__m128i __b)234 _mm_hadd_epi32(__m128i __a, __m128i __b)
235 {
236 return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
237 }
238
239 /// Horizontally adds the adjacent pairs of values contained in 2 packed
240 /// 64-bit vectors of [4 x i16].
241 ///
242 /// \headerfile <x86intrin.h>
243 ///
244 /// This intrinsic corresponds to the \c PHADDW instruction.
245 ///
246 /// \param __a
247 /// A 64-bit vector of [4 x i16] containing one of the source operands. The
248 /// horizontal sums of the values are stored in the lower bits of the
249 /// destination.
250 /// \param __b
251 /// A 64-bit vector of [4 x i16] containing one of the source operands. The
252 /// horizontal sums of the values are stored in the upper bits of the
253 /// destination.
254 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
255 /// operands.
256 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi16(__m64 __a,__m64 __b)257 _mm_hadd_pi16(__m64 __a, __m64 __b)
258 {
259 return __trunc64(__builtin_ia32_phaddw128(
260 (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
261 }
262
263 /// Horizontally adds the adjacent pairs of values contained in 2 packed
264 /// 64-bit vectors of [2 x i32].
265 ///
266 /// \headerfile <x86intrin.h>
267 ///
268 /// This intrinsic corresponds to the \c PHADDD instruction.
269 ///
270 /// \param __a
271 /// A 64-bit vector of [2 x i32] containing one of the source operands. The
272 /// horizontal sums of the values are stored in the lower bits of the
273 /// destination.
274 /// \param __b
275 /// A 64-bit vector of [2 x i32] containing one of the source operands. The
276 /// horizontal sums of the values are stored in the upper bits of the
277 /// destination.
278 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
279 /// operands.
280 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi32(__m64 __a,__m64 __b)281 _mm_hadd_pi32(__m64 __a, __m64 __b)
282 {
283 return __trunc64(__builtin_ia32_phaddd128(
284 (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
285 }
286
287 /// Horizontally adds, with saturation, the adjacent pairs of values contained
288 /// in two packed 128-bit vectors of [8 x i16].
289 ///
/// Sums greater than 0x7FFF (32767) are saturated to 0x7FFF. Sums less than
/// -32768 (0x8000 as a signed 16-bit value) are saturated to 0x8000.
292 ///
293 /// \headerfile <x86intrin.h>
294 ///
295 /// This intrinsic corresponds to the \c VPHADDSW instruction.
296 ///
297 /// \param __a
298 /// A 128-bit vector of [8 x i16] containing one of the source operands. The
299 /// horizontal sums of the values are stored in the lower bits of the
300 /// destination.
301 /// \param __b
302 /// A 128-bit vector of [8 x i16] containing one of the source operands. The
303 /// horizontal sums of the values are stored in the upper bits of the
304 /// destination.
305 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
306 /// sums of both operands.
307 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadds_epi16(__m128i __a,__m128i __b)308 _mm_hadds_epi16(__m128i __a, __m128i __b)
309 {
310 return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
311 }
312
313 /// Horizontally adds, with saturation, the adjacent pairs of values contained
314 /// in two packed 64-bit vectors of [4 x i16].
315 ///
/// Sums greater than 0x7FFF (32767) are saturated to 0x7FFF. Sums less than
/// -32768 (0x8000 as a signed 16-bit value) are saturated to 0x8000.
318 ///
319 /// \headerfile <x86intrin.h>
320 ///
321 /// This intrinsic corresponds to the \c PHADDSW instruction.
322 ///
323 /// \param __a
324 /// A 64-bit vector of [4 x i16] containing one of the source operands. The
325 /// horizontal sums of the values are stored in the lower bits of the
326 /// destination.
327 /// \param __b
328 /// A 64-bit vector of [4 x i16] containing one of the source operands. The
329 /// horizontal sums of the values are stored in the upper bits of the
330 /// destination.
331 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
332 /// sums of both operands.
333 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadds_pi16(__m64 __a,__m64 __b)334 _mm_hadds_pi16(__m64 __a, __m64 __b)
335 {
336 return __trunc64(__builtin_ia32_phaddsw128(
337 (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
338 }
339
340 /// Horizontally subtracts the adjacent pairs of values contained in 2
341 /// packed 128-bit vectors of [8 x i16].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the \c VPHSUBW instruction.
346 ///
347 /// \param __a
348 /// A 128-bit vector of [8 x i16] containing one of the source operands. The
349 /// horizontal differences between the values are stored in the lower bits of
350 /// the destination.
351 /// \param __b
352 /// A 128-bit vector of [8 x i16] containing one of the source operands. The
353 /// horizontal differences between the values are stored in the upper bits of
354 /// the destination.
355 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
356 /// of both operands.
357 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi16(__m128i __a,__m128i __b)358 _mm_hsub_epi16(__m128i __a, __m128i __b)
359 {
360 return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
361 }
362
363 /// Horizontally subtracts the adjacent pairs of values contained in 2
364 /// packed 128-bit vectors of [4 x i32].
365 ///
366 /// \headerfile <x86intrin.h>
367 ///
368 /// This intrinsic corresponds to the \c VPHSUBD instruction.
369 ///
370 /// \param __a
371 /// A 128-bit vector of [4 x i32] containing one of the source operands. The
372 /// horizontal differences between the values are stored in the lower bits of
373 /// the destination.
374 /// \param __b
375 /// A 128-bit vector of [4 x i32] containing one of the source operands. The
376 /// horizontal differences between the values are stored in the upper bits of
377 /// the destination.
378 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
379 /// of both operands.
380 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi32(__m128i __a,__m128i __b)381 _mm_hsub_epi32(__m128i __a, __m128i __b)
382 {
383 return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
384 }
385
386 /// Horizontally subtracts the adjacent pairs of values contained in 2
387 /// packed 64-bit vectors of [4 x i16].
388 ///
389 /// \headerfile <x86intrin.h>
390 ///
391 /// This intrinsic corresponds to the \c PHSUBW instruction.
392 ///
393 /// \param __a
394 /// A 64-bit vector of [4 x i16] containing one of the source operands. The
395 /// horizontal differences between the values are stored in the lower bits of
396 /// the destination.
397 /// \param __b
398 /// A 64-bit vector of [4 x i16] containing one of the source operands. The
399 /// horizontal differences between the values are stored in the upper bits of
400 /// the destination.
401 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
402 /// of both operands.
403 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi16(__m64 __a,__m64 __b)404 _mm_hsub_pi16(__m64 __a, __m64 __b)
405 {
406 return __trunc64(__builtin_ia32_phsubw128(
407 (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
408 }
409
410 /// Horizontally subtracts the adjacent pairs of values contained in 2
411 /// packed 64-bit vectors of [2 x i32].
412 ///
413 /// \headerfile <x86intrin.h>
414 ///
415 /// This intrinsic corresponds to the \c PHSUBD instruction.
416 ///
417 /// \param __a
418 /// A 64-bit vector of [2 x i32] containing one of the source operands. The
419 /// horizontal differences between the values are stored in the lower bits of
420 /// the destination.
421 /// \param __b
422 /// A 64-bit vector of [2 x i32] containing one of the source operands. The
423 /// horizontal differences between the values are stored in the upper bits of
424 /// the destination.
425 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
426 /// of both operands.
427 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi32(__m64 __a,__m64 __b)428 _mm_hsub_pi32(__m64 __a, __m64 __b)
429 {
430 return __trunc64(__builtin_ia32_phsubd128(
431 (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
432 }
433
434 /// Horizontally subtracts, with saturation, the adjacent pairs of values
435 /// contained in two packed 128-bit vectors of [8 x i16].
436 ///
/// Differences greater than 0x7FFF (32767) are saturated to 0x7FFF.
/// Differences less than -32768 (0x8000 as a signed 16-bit value) are
/// saturated to 0x8000.
439 ///
440 /// \headerfile <x86intrin.h>
441 ///
442 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
443 ///
444 /// \param __a
445 /// A 128-bit vector of [8 x i16] containing one of the source operands. The
446 /// horizontal differences between the values are stored in the lower bits of
447 /// the destination.
448 /// \param __b
449 /// A 128-bit vector of [8 x i16] containing one of the source operands. The
450 /// horizontal differences between the values are stored in the upper bits of
451 /// the destination.
452 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
453 /// differences of both operands.
454 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsubs_epi16(__m128i __a,__m128i __b)455 _mm_hsubs_epi16(__m128i __a, __m128i __b)
456 {
457 return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
458 }
459
460 /// Horizontally subtracts, with saturation, the adjacent pairs of values
461 /// contained in two packed 64-bit vectors of [4 x i16].
462 ///
/// Differences greater than 0x7FFF (32767) are saturated to 0x7FFF.
/// Differences less than -32768 (0x8000 as a signed 16-bit value) are
/// saturated to 0x8000.
465 ///
466 /// \headerfile <x86intrin.h>
467 ///
468 /// This intrinsic corresponds to the \c PHSUBSW instruction.
469 ///
470 /// \param __a
471 /// A 64-bit vector of [4 x i16] containing one of the source operands. The
472 /// horizontal differences between the values are stored in the lower bits of
473 /// the destination.
474 /// \param __b
475 /// A 64-bit vector of [4 x i16] containing one of the source operands. The
476 /// horizontal differences between the values are stored in the upper bits of
477 /// the destination.
478 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
479 /// differences of both operands.
480 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsubs_pi16(__m64 __a,__m64 __b)481 _mm_hsubs_pi16(__m64 __a, __m64 __b)
482 {
483 return __trunc64(__builtin_ia32_phsubsw128(
484 (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
485 }
486
487 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
488 /// values contained in the first source operand and packed 8-bit signed
489 /// integer values contained in the second source operand, adds pairs of
490 /// contiguous products with signed saturation, and writes the 16-bit sums to
491 /// the corresponding bits in the destination.
492 ///
493 /// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
494 /// both operands are multiplied, and the sum of both results is written to
495 /// bits [15:0] of the destination.
496 ///
497 /// \headerfile <x86intrin.h>
498 ///
499 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
500 ///
501 /// \param __a
502 /// A 128-bit integer vector containing the first source operand.
503 /// \param __b
504 /// A 128-bit integer vector containing the second source operand.
505 /// \returns A 128-bit integer vector containing the sums of products of both
506 /// operands: \n
507 /// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
508 /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
509 /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
510 /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
511 /// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
512 /// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
513 /// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
514 /// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
515 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maddubs_epi16(__m128i __a,__m128i __b)516 _mm_maddubs_epi16(__m128i __a, __m128i __b)
517 {
518 return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
519 }
520
521 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
522 /// values contained in the first source operand and packed 8-bit signed
523 /// integer values contained in the second source operand, adds pairs of
524 /// contiguous products with signed saturation, and writes the 16-bit sums to
525 /// the corresponding bits in the destination.
526 ///
527 /// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
528 /// both operands are multiplied, and the sum of both results is written to
529 /// bits [15:0] of the destination.
530 ///
531 /// \headerfile <x86intrin.h>
532 ///
533 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
534 ///
535 /// \param __a
536 /// A 64-bit integer vector containing the first source operand.
537 /// \param __b
538 /// A 64-bit integer vector containing the second source operand.
539 /// \returns A 64-bit integer vector containing the sums of products of both
540 /// operands: \n
541 /// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
542 /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
543 /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
544 /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
545 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a,__m64 __b)546 _mm_maddubs_pi16(__m64 __a, __m64 __b)
547 {
548 return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
549 (__v16qi)__anyext128(__b)));
550 }
551
552 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
553 /// products to the 18 most significant bits by right-shifting, rounds the
554 /// truncated value by adding 1, and writes bits [16:1] to the destination.
555 ///
556 /// \headerfile <x86intrin.h>
557 ///
558 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
559 ///
560 /// \param __a
561 /// A 128-bit vector of [8 x i16] containing one of the source operands.
562 /// \param __b
563 /// A 128-bit vector of [8 x i16] containing one of the source operands.
564 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
565 /// products of both operands.
566 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhrs_epi16(__m128i __a,__m128i __b)567 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
568 {
569 return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
570 }
571
572 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
573 /// products to the 18 most significant bits by right-shifting, rounds the
574 /// truncated value by adding 1, and writes bits [16:1] to the destination.
575 ///
576 /// \headerfile <x86intrin.h>
577 ///
578 /// This intrinsic corresponds to the \c PMULHRSW instruction.
579 ///
580 /// \param __a
581 /// A 64-bit vector of [4 x i16] containing one of the source operands.
582 /// \param __b
583 /// A 64-bit vector of [4 x i16] containing one of the source operands.
584 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
585 /// products of both operands.
586 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mulhrs_pi16(__m64 __a,__m64 __b)587 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
588 {
589 return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
590 (__v8hi)__anyext128(__b)));
591 }
592
593 /// Copies the 8-bit integers from a 128-bit integer vector to the
594 /// destination or clears 8-bit values in the destination, as specified by
595 /// the second source operand.
596 ///
597 /// \headerfile <x86intrin.h>
598 ///
599 /// This intrinsic corresponds to the \c VPSHUFB instruction.
600 ///
601 /// \param __a
602 /// A 128-bit integer vector containing the values to be copied.
603 /// \param __b
604 /// A 128-bit integer vector containing control bytes corresponding to
605 /// positions in the destination:
606 /// Bit 7: \n
607 /// 1: Clear the corresponding byte in the destination. \n
608 /// 0: Copy the selected source byte to the corresponding byte in the
609 /// destination. \n
610 /// Bits [6:4] Reserved. \n
611 /// Bits [3:0] select the source byte to be copied.
612 /// \returns A 128-bit integer vector containing the copied or cleared values.
613 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shuffle_epi8(__m128i __a,__m128i __b)614 _mm_shuffle_epi8(__m128i __a, __m128i __b)
615 {
616 return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
617 }
618
619 /// Copies the 8-bit integers from a 64-bit integer vector to the
620 /// destination or clears 8-bit values in the destination, as specified by
621 /// the second source operand.
622 ///
623 /// \headerfile <x86intrin.h>
624 ///
625 /// This intrinsic corresponds to the \c PSHUFB instruction.
626 ///
627 /// \param __a
628 /// A 64-bit integer vector containing the values to be copied.
629 /// \param __b
630 /// A 64-bit integer vector containing control bytes corresponding to
631 /// positions in the destination:
632 /// Bit 7: \n
633 /// 1: Clear the corresponding byte in the destination. \n
634 /// 0: Copy the selected source byte to the corresponding byte in the
635 /// destination. \n
636 /// Bits [2:0] select the source byte to be copied.
637 /// \returns A 64-bit integer vector containing the copied or cleared values.
638 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_shuffle_pi8(__m64 __a,__m64 __b)639 _mm_shuffle_pi8(__m64 __a, __m64 __b)
640 {
641 return __trunc64(__builtin_ia32_pshufb128(
642 (__v16qi)__builtin_shufflevector(
643 (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
644 (__v16qi)__anyext128(__b)));
645 }
646
647 /// For each 8-bit integer in the first source operand, perform one of
648 /// the following actions as specified by the second source operand.
649 ///
650 /// If the byte in the second source is negative, calculate the two's
651 /// complement of the corresponding byte in the first source, and write that
652 /// value to the destination. If the byte in the second source is positive,
653 /// copy the corresponding byte from the first source to the destination. If
654 /// the byte in the second source is zero, clear the corresponding byte in
655 /// the destination.
656 ///
657 /// \headerfile <x86intrin.h>
658 ///
659 /// This intrinsic corresponds to the \c VPSIGNB instruction.
660 ///
661 /// \param __a
662 /// A 128-bit integer vector containing the values to be copied.
663 /// \param __b
664 /// A 128-bit integer vector containing control bytes corresponding to
665 /// positions in the destination.
666 /// \returns A 128-bit integer vector containing the resultant values.
667 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi8(__m128i __a,__m128i __b)668 _mm_sign_epi8(__m128i __a, __m128i __b)
669 {
670 return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
671 }
672
673 /// For each 16-bit integer in the first source operand, perform one of
674 /// the following actions as specified by the second source operand.
675 ///
676 /// If the word in the second source is negative, calculate the two's
677 /// complement of the corresponding word in the first source, and write that
678 /// value to the destination. If the word in the second source is positive,
679 /// copy the corresponding word from the first source to the destination. If
680 /// the word in the second source is zero, clear the corresponding word in
681 /// the destination.
682 ///
683 /// \headerfile <x86intrin.h>
684 ///
685 /// This intrinsic corresponds to the \c VPSIGNW instruction.
686 ///
687 /// \param __a
688 /// A 128-bit integer vector containing the values to be copied.
689 /// \param __b
690 /// A 128-bit integer vector containing control words corresponding to
691 /// positions in the destination.
692 /// \returns A 128-bit integer vector containing the resultant values.
693 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi16(__m128i __a,__m128i __b)694 _mm_sign_epi16(__m128i __a, __m128i __b)
695 {
696 return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
697 }
698
699 /// For each 32-bit integer in the first source operand, perform one of
700 /// the following actions as specified by the second source operand.
701 ///
/// If the doubleword in the second source is negative, calculate the two's
/// complement of the corresponding doubleword in the first source, and
/// write that value to the destination. If the doubleword in the second
/// source is positive, copy the corresponding doubleword from the first
/// source to the destination. If the doubleword in the second source is
/// zero, clear the corresponding doubleword in the destination.
708 ///
709 /// \headerfile <x86intrin.h>
710 ///
711 /// This intrinsic corresponds to the \c VPSIGND instruction.
712 ///
713 /// \param __a
714 /// A 128-bit integer vector containing the values to be copied.
715 /// \param __b
716 /// A 128-bit integer vector containing control doublewords corresponding to
717 /// positions in the destination.
718 /// \returns A 128-bit integer vector containing the resultant values.
719 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi32(__m128i __a,__m128i __b)720 _mm_sign_epi32(__m128i __a, __m128i __b)
721 {
722 return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
723 }
724
725 /// For each 8-bit integer in the first source operand, perform one of
726 /// the following actions as specified by the second source operand.
727 ///
728 /// If the byte in the second source is negative, calculate the two's
729 /// complement of the corresponding byte in the first source, and write that
730 /// value to the destination. If the byte in the second source is positive,
731 /// copy the corresponding byte from the first source to the destination. If
732 /// the byte in the second source is zero, clear the corresponding byte in
733 /// the destination.
734 ///
735 /// \headerfile <x86intrin.h>
736 ///
737 /// This intrinsic corresponds to the \c PSIGNB instruction.
738 ///
739 /// \param __a
740 /// A 64-bit integer vector containing the values to be copied.
741 /// \param __b
742 /// A 64-bit integer vector containing control bytes corresponding to
743 /// positions in the destination.
744 /// \returns A 64-bit integer vector containing the resultant values.
745 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi8(__m64 __a,__m64 __b)746 _mm_sign_pi8(__m64 __a, __m64 __b)
747 {
748 return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
749 (__v16qi)__anyext128(__b)));
750 }
751
752 /// For each 16-bit integer in the first source operand, perform one of
753 /// the following actions as specified by the second source operand.
754 ///
755 /// If the word in the second source is negative, calculate the two's
756 /// complement of the corresponding word in the first source, and write that
757 /// value to the destination. If the word in the second source is positive,
758 /// copy the corresponding word from the first source to the destination. If
759 /// the word in the second source is zero, clear the corresponding word in
760 /// the destination.
761 ///
762 /// \headerfile <x86intrin.h>
763 ///
764 /// This intrinsic corresponds to the \c PSIGNW instruction.
765 ///
766 /// \param __a
767 /// A 64-bit integer vector containing the values to be copied.
768 /// \param __b
769 /// A 64-bit integer vector containing control words corresponding to
770 /// positions in the destination.
771 /// \returns A 64-bit integer vector containing the resultant values.
772 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi16(__m64 __a,__m64 __b)773 _mm_sign_pi16(__m64 __a, __m64 __b)
774 {
775 return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
776 (__v8hi)__anyext128(__b)));
777 }
778
779 /// For each 32-bit integer in the first source operand, perform one of
780 /// the following actions as specified by the second source operand.
781 ///
782 /// If the doubleword in the second source is negative, calculate the two's
783 /// complement of the corresponding doubleword in the first source, and
784 /// write that value to the destination. If the doubleword in the second
785 /// source is positive, copy the corresponding doubleword from the first
786 /// source to the destination. If the doubleword in the second source is
787 /// zero, clear the corresponding doubleword in the destination.
788 ///
789 /// \headerfile <x86intrin.h>
790 ///
791 /// This intrinsic corresponds to the \c PSIGND instruction.
792 ///
793 /// \param __a
794 /// A 64-bit integer vector containing the values to be copied.
795 /// \param __b
796 /// A 64-bit integer vector containing two control doublewords corresponding
797 /// to positions in the destination.
798 /// \returns A 64-bit integer vector containing the resultant values.
799 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi32(__m64 __a,__m64 __b)800 _mm_sign_pi32(__m64 __a, __m64 __b)
801 {
802 return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
803 (__v4si)__anyext128(__b)));
804 }
805
/* The helper macros are private to this header; remove them so they do not
 * leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS
#undef __trunc64
#undef __anyext128
809
810 #endif /* __TMMINTRIN_H */
811