/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9 
10 #ifndef __IMMINTRIN_H
11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
12 #endif
13 
14 #ifndef __FMAINTRIN_H
15 #define __FMAINTRIN_H
16 
17 /* Define the default attributes for the functions in this file. */
18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
20 
21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
22 ///    For each element, computes <c> (__A * __B) + __C </c>.
23 ///
24 /// \headerfile <immintrin.h>
25 ///
26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
27 ///
28 /// \param __A
29 ///    A 128-bit vector of [4 x float] containing the multiplicand.
30 /// \param __B
31 ///    A 128-bit vector of [4 x float] containing the multiplier.
32 /// \param __C
33 ///    A 128-bit vector of [4 x float] containing the addend.
34 /// \returns A 128-bit vector of [4 x float] containing the result.
35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ps(__m128 __A,__m128 __B,__m128 __C)36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
37 {
38   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
39 }
40 
41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
42 ///    For each element, computes <c> (__A * __B) + __C </c>.
43 ///
44 /// \headerfile <immintrin.h>
45 ///
46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
47 ///
48 /// \param __A
49 ///    A 128-bit vector of [2 x double] containing the multiplicand.
50 /// \param __B
51 ///    A 128-bit vector of [2 x double] containing the multiplier.
52 /// \param __C
53 ///    A 128-bit vector of [2 x double] containing the addend.
54 /// \returns A 128-bit [2 x double] vector containing the result.
55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_pd(__m128d __A,__m128d __B,__m128d __C)56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
57 {
58   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
59 }
60 
61 /// Computes a scalar multiply-add of the single-precision values in the
62 ///    low 32 bits of 128-bit vectors of [4 x float].
63 ///
64 /// \code{.operation}
65 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
66 /// result[127:32] = __A[127:32]
67 /// \endcode
68 ///
69 /// \headerfile <immintrin.h>
70 ///
71 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
72 ///
73 /// \param __A
74 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
75 ///    32 bits.
76 /// \param __B
77 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
78 ///    32 bits.
79 /// \param __C
80 ///    A 128-bit vector of [4 x float] containing the addend in the low
81 ///    32 bits.
82 /// \returns A 128-bit vector of [4 x float] containing the result in the low
83 ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
84 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ss(__m128 __A,__m128 __B,__m128 __C)85 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
86 {
87   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
88 }
89 
90 /// Computes a scalar multiply-add of the double-precision values in the
91 ///    low 64 bits of 128-bit vectors of [2 x double].
92 ///
93 /// \code{.operation}
94 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
95 /// result[127:64] = __A[127:64]
96 /// \endcode
97 ///
98 /// \headerfile <immintrin.h>
99 ///
100 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
101 ///
102 /// \param __A
103 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
104 ///    64 bits.
105 /// \param __B
106 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
107 ///    64 bits.
108 /// \param __C
109 ///    A 128-bit vector of [2 x double] containing the addend in the low
110 ///    64 bits.
111 /// \returns A 128-bit vector of [2 x double] containing the result in the low
112 ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
113 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_sd(__m128d __A,__m128d __B,__m128d __C)114 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
115 {
116   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
117 }
118 
119 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
120 ///    For each element, computes <c> (__A * __B) - __C </c>.
121 ///
122 /// \headerfile <immintrin.h>
123 ///
124 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
125 ///
126 /// \param __A
127 ///    A 128-bit vector of [4 x float] containing the multiplicand.
128 /// \param __B
129 ///    A 128-bit vector of [4 x float] containing the multiplier.
130 /// \param __C
131 ///    A 128-bit vector of [4 x float] containing the subtrahend.
132 /// \returns A 128-bit vector of [4 x float] containing the result.
133 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ps(__m128 __A,__m128 __B,__m128 __C)134 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
135 {
136   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
137 }
138 
139 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
140 ///    For each element, computes <c> (__A * __B) - __C </c>.
141 ///
142 /// \headerfile <immintrin.h>
143 ///
144 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
145 ///
146 /// \param __A
147 ///    A 128-bit vector of [2 x double] containing the multiplicand.
148 /// \param __B
149 ///    A 128-bit vector of [2 x double] containing the multiplier.
150 /// \param __C
151 ///    A 128-bit vector of [2 x double] containing the addend.
152 /// \returns A 128-bit vector of [2 x double] containing the result.
153 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_pd(__m128d __A,__m128d __B,__m128d __C)154 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
155 {
156   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
157 }
158 
159 /// Computes a scalar multiply-subtract of the single-precision values in
160 ///    the low 32 bits of 128-bit vectors of [4 x float].
161 ///
162 /// \code{.operation}
163 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
164 /// result[127:32] = __A[127:32]
165 /// \endcode
166 ///
167 /// \headerfile <immintrin.h>
168 ///
169 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
170 ///
171 /// \param __A
172 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
173 ///    32 bits.
174 /// \param __B
175 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
176 ///    32 bits.
177 /// \param __C
178 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
179 ///   32 bits.
180 /// \returns A 128-bit vector of [4 x float] containing the result in the low
181 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
182 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ss(__m128 __A,__m128 __B,__m128 __C)183 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
184 {
185   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
186 }
187 
188 /// Computes a scalar multiply-subtract of the double-precision values in
189 ///    the low 64 bits of 128-bit vectors of [2 x double].
190 ///
191 /// \code{.operation}
192 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
193 /// result[127:64] = __A[127:64]
194 /// \endcode
195 ///
196 /// \headerfile <immintrin.h>
197 ///
198 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
199 ///
200 /// \param __A
201 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
202 ///    64 bits.
203 /// \param __B
204 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
205 ///    64 bits.
206 /// \param __C
207 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
208 ///    64 bits.
209 /// \returns A 128-bit vector of [2 x double] containing the result in the low
210 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
211 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_sd(__m128d __A,__m128d __B,__m128d __C)212 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
213 {
214   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
215 }
216 
217 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
218 ///    For each element, computes <c> -(__A * __B) + __C </c>.
219 ///
220 /// \headerfile <immintrin.h>
221 ///
222 /// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
223 ///
224 /// \param __A
225 ///    A 128-bit vector of [4 x float] containing the multiplicand.
226 /// \param __B
227 ///    A 128-bit vector of [4 x float] containing the multiplier.
228 /// \param __C
229 ///    A 128-bit vector of [4 x float] containing the addend.
230 /// \returns A 128-bit [4 x float] vector containing the result.
231 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ps(__m128 __A,__m128 __B,__m128 __C)232 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
233 {
234   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
235 }
236 
237 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
238 ///    For each element, computes <c> -(__A * __B) + __C </c>.
239 ///
240 /// \headerfile <immintrin.h>
241 ///
242 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
243 ///
244 /// \param __A
245 ///    A 128-bit vector of [2 x double] containing the multiplicand.
246 /// \param __B
247 ///    A 128-bit vector of [2 x double] containing the multiplier.
248 /// \param __C
249 ///    A 128-bit vector of [2 x double] containing the addend.
250 /// \returns A 128-bit vector of [2 x double] containing the result.
251 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_pd(__m128d __A,__m128d __B,__m128d __C)252 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
253 {
254   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
255 }
256 
257 /// Computes a scalar negated multiply-add of the single-precision values in
258 ///    the low 32 bits of 128-bit vectors of [4 x float].
259 ///
260 /// \code{.operation}
261 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
262 /// result[127:32] = __A[127:32]
263 /// \endcode
264 ///
265 /// \headerfile <immintrin.h>
266 ///
267 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
268 ///
269 /// \param __A
270 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
271 ///    32 bits.
272 /// \param __B
273 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
274 ///    32 bits.
275 /// \param __C
276 ///    A 128-bit vector of [4 x float] containing the addend in the low
277 ///    32 bits.
278 /// \returns A 128-bit vector of [4 x float] containing the result in the low
279 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
280 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ss(__m128 __A,__m128 __B,__m128 __C)281 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
282 {
283   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
284 }
285 
286 /// Computes a scalar negated multiply-add of the double-precision values
287 ///    in the low 64 bits of 128-bit vectors of [2 x double].
288 ///
289 /// \code{.operation}
290 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
291 /// result[127:64] = __A[127:64]
292 /// \endcode
293 ///
294 /// \headerfile <immintrin.h>
295 ///
296 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
297 ///
298 /// \param __A
299 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
300 ///    64 bits.
301 /// \param __B
302 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
303 ///    64 bits.
304 /// \param __C
305 ///    A 128-bit vector of [2 x double] containing the addend in the low
306 ///    64 bits.
307 /// \returns A 128-bit vector of [2 x double] containing the result in the low
308 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
309 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_sd(__m128d __A,__m128d __B,__m128d __C)310 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
311 {
312   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
313 }
314 
315 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
316 ///    For each element, computes <c> -(__A * __B) - __C </c>.
317 ///
318 /// \headerfile <immintrin.h>
319 ///
320 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
321 ///
322 /// \param __A
323 ///    A 128-bit vector of [4 x float] containing the multiplicand.
324 /// \param __B
325 ///    A 128-bit vector of [4 x float] containing the multiplier.
326 /// \param __C
327 ///    A 128-bit vector of [4 x float] containing the subtrahend.
328 /// \returns A 128-bit vector of [4 x float] containing the result.
329 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ps(__m128 __A,__m128 __B,__m128 __C)330 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
331 {
332   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
333 }
334 
335 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
336 ///    For each element, computes <c> -(__A * __B) - __C </c>.
337 ///
338 /// \headerfile <immintrin.h>
339 ///
340 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
341 ///
342 /// \param __A
343 ///    A 128-bit vector of [2 x double] containing the multiplicand.
344 /// \param __B
345 ///    A 128-bit vector of [2 x double] containing the multiplier.
346 /// \param __C
347 ///    A 128-bit vector of [2 x double] containing the subtrahend.
348 /// \returns A 128-bit vector of [2 x double] containing the result.
349 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_pd(__m128d __A,__m128d __B,__m128d __C)350 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
351 {
352   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
353 }
354 
355 /// Computes a scalar negated multiply-subtract of the single-precision
356 ///    values in the low 32 bits of 128-bit vectors of [4 x float].
357 ///
358 /// \code{.operation}
359 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
360 /// result[127:32] = __A[127:32]
361 /// \endcode
362 ///
363 /// \headerfile <immintrin.h>
364 ///
365 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
366 ///
367 /// \param __A
368 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
369 ///    32 bits.
370 /// \param __B
371 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
372 ///    32 bits.
373 /// \param __C
374 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
375 ///    32 bits.
376 /// \returns A 128-bit vector of [4 x float] containing the result in the low
377 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
378 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ss(__m128 __A,__m128 __B,__m128 __C)379 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
380 {
381   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
382 }
383 
384 /// Computes a scalar negated multiply-subtract of the double-precision
385 ///    values in the low 64 bits of 128-bit vectors of [2 x double].
386 ///
387 /// \code{.operation}
388 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
389 /// result[127:64] = __A[127:64]
390 /// \endcode
391 ///
392 /// \headerfile <immintrin.h>
393 ///
394 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
395 ///
396 /// \param __A
397 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
398 ///    64 bits.
399 /// \param __B
400 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
401 ///    64 bits.
402 /// \param __C
403 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
404 ///    64 bits.
405 /// \returns A 128-bit vector of [2 x double] containing the result in the low
406 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
407 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_sd(__m128d __A,__m128d __B,__m128d __C)408 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
409 {
410   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
411 }
412 
413 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
414 ///    [4 x float].
415 ///
416 /// \code{.operation}
417 /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
418 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
419 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
420 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
421 /// \endcode
422 ///
423 /// \headerfile <immintrin.h>
424 ///
425 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
426 ///
427 /// \param __A
428 ///    A 128-bit vector of [4 x float] containing the multiplicand.
429 /// \param __B
430 ///    A 128-bit vector of [4 x float] containing the multiplier.
431 /// \param __C
432 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
433 /// \returns A 128-bit vector of [4 x float] containing the result.
434 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmaddsub_ps(__m128 __A,__m128 __B,__m128 __C)435 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
436 {
437   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
438 }
439 
440 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
441 ///    [2 x double].
442 ///
443 /// \code{.operation}
444 /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
445 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
446 /// \endcode
447 ///
448 /// \headerfile <immintrin.h>
449 ///
450 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
451 ///
452 /// \param __A
453 ///    A 128-bit vector of [2 x double] containing the multiplicand.
454 /// \param __B
455 ///    A 128-bit vector of [2 x double] containing the multiplier.
456 /// \param __C
457 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
458 /// \returns A 128-bit vector of [2 x double] containing the result.
459 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmaddsub_pd(__m128d __A,__m128d __B,__m128d __C)460 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
461 {
462   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
463 }
464 
465 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
466 ///    [4 x float].
467 ///
468 /// \code{.operation}
469 /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
470 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
471 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
472 /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
473 /// \endcode
474 ///
475 /// \headerfile <immintrin.h>
476 ///
477 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
478 ///
479 /// \param __A
480 ///    A 128-bit vector of [4 x float] containing the multiplicand.
481 /// \param __B
482 ///    A 128-bit vector of [4 x float] containing the multiplier.
483 /// \param __C
484 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
485 /// \returns A 128-bit vector of [4 x float] containing the result.
486 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsubadd_ps(__m128 __A,__m128 __B,__m128 __C)487 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
488 {
489   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
490 }
491 
492 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
493 ///    [2 x double].
494 ///
495 /// \code{.operation}
496 /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
497 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
498 /// \endcode
499 ///
500 /// \headerfile <immintrin.h>
501 ///
502 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
503 ///
504 /// \param __A
505 ///    A 128-bit vector of [2 x double] containing the multiplicand.
506 /// \param __B
507 ///    A 128-bit vector of [2 x double] containing the multiplier.
508 /// \param __C
509 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
510 /// \returns A 128-bit vector of [2 x double] containing the result.
511 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsubadd_pd(__m128d __A,__m128d __B,__m128d __C)512 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
513 {
514   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
515 }
516 
517 /// Computes a multiply-add of 256-bit vectors of [8 x float].
518 ///    For each element, computes <c> (__A * __B) + __C </c>.
519 ///
520 /// \headerfile <immintrin.h>
521 ///
522 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
523 ///
524 /// \param __A
525 ///    A 256-bit vector of [8 x float] containing the multiplicand.
526 /// \param __B
527 ///    A 256-bit vector of [8 x float] containing the multiplier.
528 /// \param __C
529 ///    A 256-bit vector of [8 x float] containing the addend.
530 /// \returns A 256-bit vector of [8 x float] containing the result.
531 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmadd_ps(__m256 __A,__m256 __B,__m256 __C)532 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
533 {
534   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
535 }
536 
537 /// Computes a multiply-add of 256-bit vectors of [4 x double].
538 ///    For each element, computes <c> (__A * __B) + __C </c>.
539 ///
540 /// \headerfile <immintrin.h>
541 ///
542 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
543 ///
544 /// \param __A
545 ///    A 256-bit vector of [4 x double] containing the multiplicand.
546 /// \param __B
547 ///    A 256-bit vector of [4 x double] containing the multiplier.
548 /// \param __C
549 ///    A 256-bit vector of [4 x double] containing the addend.
550 /// \returns A 256-bit vector of [4 x double] containing the result.
551 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmadd_pd(__m256d __A,__m256d __B,__m256d __C)552 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
553 {
554   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
555 }
556 
557 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
558 ///    For each element, computes <c> (__A * __B) - __C </c>.
559 ///
560 /// \headerfile <immintrin.h>
561 ///
562 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
563 ///
564 /// \param __A
565 ///    A 256-bit vector of [8 x float] containing the multiplicand.
566 /// \param __B
567 ///    A 256-bit vector of [8 x float] containing the multiplier.
568 /// \param __C
569 ///    A 256-bit vector of [8 x float] containing the subtrahend.
570 /// \returns A 256-bit vector of [8 x float] containing the result.
571 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsub_ps(__m256 __A,__m256 __B,__m256 __C)572 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
573 {
574   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
575 }
576 
577 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
578 ///    For each element, computes <c> (__A * __B) - __C </c>.
579 ///
580 /// \headerfile <immintrin.h>
581 ///
582 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
583 ///
584 /// \param __A
585 ///    A 256-bit vector of [4 x double] containing the multiplicand.
586 /// \param __B
587 ///    A 256-bit vector of [4 x double] containing the multiplier.
588 /// \param __C
589 ///    A 256-bit vector of [4 x double] containing the subtrahend.
590 /// \returns A 256-bit vector of [4 x double] containing the result.
591 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsub_pd(__m256d __A,__m256d __B,__m256d __C)592 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
593 {
594   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
595 }
596 
597 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
598 ///    For each element, computes <c> -(__A * __B) + __C </c>.
599 ///
600 /// \headerfile <immintrin.h>
601 ///
602 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
603 ///
604 /// \param __A
605 ///    A 256-bit vector of [8 x float] containing the multiplicand.
606 /// \param __B
607 ///    A 256-bit vector of [8 x float] containing the multiplier.
608 /// \param __C
609 ///    A 256-bit vector of [8 x float] containing the addend.
610 /// \returns A 256-bit vector of [8 x float] containing the result.
611 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmadd_ps(__m256 __A,__m256 __B,__m256 __C)612 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
613 {
614   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
615 }
616 
617 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
618 ///    For each element, computes <c> -(__A * __B) + __C </c>.
619 ///
620 /// \headerfile <immintrin.h>
621 ///
622 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
623 ///
624 /// \param __A
625 ///    A 256-bit vector of [4 x double] containing the multiplicand.
626 /// \param __B
627 ///    A 256-bit vector of [4 x double] containing the multiplier.
628 /// \param __C
629 ///    A 256-bit vector of [4 x double] containing the addend.
630 /// \returns A 256-bit vector of [4 x double] containing the result.
631 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmadd_pd(__m256d __A,__m256d __B,__m256d __C)632 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
633 {
634   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
635 }
636 
637 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
638 ///    For each element, computes <c> -(__A * __B) - __C </c>.
639 ///
640 /// \headerfile <immintrin.h>
641 ///
642 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
643 ///
644 /// \param __A
645 ///    A 256-bit vector of [8 x float] containing the multiplicand.
646 /// \param __B
647 ///    A 256-bit vector of [8 x float] containing the multiplier.
648 /// \param __C
649 ///    A 256-bit vector of [8 x float] containing the subtrahend.
650 /// \returns A 256-bit vector of [8 x float] containing the result.
651 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmsub_ps(__m256 __A,__m256 __B,__m256 __C)652 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
653 {
654   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
655 }
656 
657 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
658 ///    For each element, computes <c> -(__A * __B) - __C </c>.
659 ///
660 /// \headerfile <immintrin.h>
661 ///
662 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
663 ///
664 /// \param __A
665 ///    A 256-bit vector of [4 x double] containing the multiplicand.
666 /// \param __B
667 ///    A 256-bit vector of [4 x double] containing the multiplier.
668 /// \param __C
669 ///    A 256-bit vector of [4 x double] containing the subtrahend.
670 /// \returns A 256-bit vector of [4 x double] containing the result.
671 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmsub_pd(__m256d __A,__m256d __B,__m256d __C)672 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
673 {
674   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
675 }
676 
677 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
678 ///    [8 x float].
679 ///
680 /// \code{.operation}
681 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
682 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
683 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
684 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
685 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
686 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
687 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
688 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
689 /// \endcode
690 ///
691 /// \headerfile <immintrin.h>
692 ///
693 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
694 ///
695 /// \param __A
696 ///    A 256-bit vector of [8 x float] containing the multiplicand.
697 /// \param __B
698 ///    A 256-bit vector of [8 x float] containing the multiplier.
699 /// \param __C
700 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
701 /// \returns A 256-bit vector of [8 x float] containing the result.
702 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ps(__m256 __A,__m256 __B,__m256 __C)703 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
704 {
705   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
706 }
707 
708 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
709 ///    [4 x double].
710 ///
711 /// \code{.operation}
712 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
713 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
714 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
715 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
716 /// \endcode
717 ///
718 /// \headerfile <immintrin.h>
719 ///
720 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
721 ///
722 /// \param __A
723 ///    A 256-bit vector of [4 x double] containing the multiplicand.
724 /// \param __B
725 ///    A 256-bit vector of [4 x double] containing the multiplier.
726 /// \param __C
727 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
728 /// \returns A 256-bit vector of [4 x double] containing the result.
729 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_pd(__m256d __A,__m256d __B,__m256d __C)730 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
731 {
732   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
733 }
734 
735 /// Computes a vector multiply with alternating add/subtract of 256-bit
736 ///    vectors of [8 x float].
737 ///
738 /// \code{.operation}
739 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
740 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
741 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
742 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
743 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
744 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
745 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
746 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
747 /// \endcode
748 ///
749 /// \headerfile <immintrin.h>
750 ///
751 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
752 ///
753 /// \param __A
754 ///    A 256-bit vector of [8 x float] containing the multiplicand.
755 /// \param __B
756 ///    A 256-bit vector of [8 x float] containing the multiplier.
757 /// \param __C
758 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
759 /// \returns A 256-bit vector of [8 x float] containing the result.
760 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ps(__m256 __A,__m256 __B,__m256 __C)761 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
762 {
763   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
764 }
765 
766 /// Computes a vector multiply with alternating add/subtract of 256-bit
767 ///    vectors of [4 x double].
768 ///
769 /// \code{.operation}
770 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
771 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
772 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
773 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
774 /// \endcode
775 ///
776 /// \headerfile <immintrin.h>
777 ///
778 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
779 ///
780 /// \param __A
781 ///    A 256-bit vector of [4 x double] containing the multiplicand.
782 /// \param __B
783 ///    A 256-bit vector of [4 x double] containing the multiplier.
784 /// \param __C
785 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
786 /// \returns A 256-bit vector of [4 x double] containing the result.
787 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_pd(__m256d __A,__m256d __B,__m256d __C)788 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
789 {
790   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
791 }
792 
793 #undef __DEFAULT_FN_ATTRS128
794 #undef __DEFAULT_FN_ATTRS256
795 
796 #endif /* __FMAINTRIN_H */
797