• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s
3; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s
4; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s
5
6; This test checks the fusing of MUL + ADDSUB to FMADDSUB and of MUL + SUBADD to FMSUBADD.
7
; Shuffle form on <2 x double>: the mask <0, 3> takes lane 0 from (A*B)-C and
; lane 1 from (A*B)+C. Both the FMA3 and the FMA4 runs expect this to become a
; single 128-bit fmaddsub instruction.
8define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B,  <2 x double> %C) #0 {
9; FMA3-LABEL: mul_addsub_pd128:
10; FMA3:       # %bb.0: # %entry
11; FMA3-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
12; FMA3-NEXT:    retq
13;
14; FMA4-LABEL: mul_addsub_pd128:
15; FMA4:       # %bb.0: # %entry
16; FMA4-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
17; FMA4-NEXT:    retq
18entry:
19  %AB = fmul <2 x double> %A, %B
20  %Sub = fsub <2 x double> %AB, %C
21  %Add = fadd <2 x double> %AB, %C
22  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
23  ret <2 x double> %Addsub
24}
25
; Shuffle form on <4 x float>: the mask <0, 5, 2, 7> takes even lanes from
; (A*B)-C and odd lanes from (A*B)+C. Both the FMA3 and the FMA4 runs expect a
; single 128-bit fmaddsub instruction.
26define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
27; FMA3-LABEL: mul_addsub_ps128:
28; FMA3:       # %bb.0: # %entry
29; FMA3-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
30; FMA3-NEXT:    retq
31;
32; FMA4-LABEL: mul_addsub_ps128:
33; FMA4:       # %bb.0: # %entry
34; FMA4-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
35; FMA4-NEXT:    retq
36entry:
37  %AB = fmul <4 x float> %A, %B
38  %Sub = fsub <4 x float> %AB, %C
39  %Add = fadd <4 x float> %AB, %C
40  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
41  ret <4 x float> %Addsub
42}
43
; Shuffle form on <4 x double>: the mask <0, 5, 2, 7> takes even lanes from
; (A*B)-C and odd lanes from (A*B)+C. Both the FMA3 and the FMA4 runs expect a
; single 256-bit (ymm) fmaddsub instruction.
44define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
45; FMA3-LABEL: mul_addsub_pd256:
46; FMA3:       # %bb.0: # %entry
47; FMA3-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
48; FMA3-NEXT:    retq
49;
50; FMA4-LABEL: mul_addsub_pd256:
51; FMA4:       # %bb.0: # %entry
52; FMA4-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
53; FMA4-NEXT:    retq
54entry:
55  %AB = fmul <4 x double> %A, %B
56  %Sub = fsub <4 x double> %AB, %C
57  %Add = fadd <4 x double> %AB, %C
58  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
59  ret <4 x double> %Addsub
60}
61
; Shuffle form on <8 x float>: the mask <0, 9, 2, 11, 4, 13, 6, 15> takes even
; lanes from (A*B)-C and odd lanes from (A*B)+C. Both the FMA3 and the FMA4
; runs expect a single 256-bit (ymm) fmaddsub instruction.
62define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
63; FMA3-LABEL: mul_addsub_ps256:
64; FMA3:       # %bb.0: # %entry
65; FMA3-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
66; FMA3-NEXT:    retq
67;
68; FMA4-LABEL: mul_addsub_ps256:
69; FMA4:       # %bb.0: # %entry
70; FMA4-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
71; FMA4-NEXT:    retq
72entry:
73  %AB = fmul <8 x float> %A, %B
74  %Sub = fsub <8 x float> %AB, %C
75  %Add = fadd <8 x float> %AB, %C
76  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
77  ret <8 x float> %Addsub
78}
79
; Shuffle form on <8 x double>: even lanes come from (A*B)-C and odd lanes from
; (A*B)+C. The runs without AVX-512 (FMA3_256 and FMA4 prefixes) expect the
; operation to be split into two ymm fmaddsub instructions; the run with
; +avx512f (FMA3_512 prefix) expects a single zmm instruction.
80define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
81; FMA3_256-LABEL: mul_addsub_pd512:
82; FMA3_256:       # %bb.0: # %entry
83; FMA3_256-NEXT:    vfmaddsub213pd %ymm4, %ymm2, %ymm0
84; FMA3_256-NEXT:    vfmaddsub213pd %ymm5, %ymm3, %ymm1
85; FMA3_256-NEXT:    retq
86;
87; FMA3_512-LABEL: mul_addsub_pd512:
88; FMA3_512:       # %bb.0: # %entry
89; FMA3_512-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
90; FMA3_512-NEXT:    retq
91;
92; FMA4-LABEL: mul_addsub_pd512:
93; FMA4:       # %bb.0: # %entry
94; FMA4-NEXT:    vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
95; FMA4-NEXT:    vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
96; FMA4-NEXT:    retq
97entry:
98  %AB = fmul <8 x double> %A, %B
99  %Sub = fsub <8 x double> %AB, %C
100  %Add = fadd <8 x double> %AB, %C
101  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
102  ret <8 x double> %Addsub
103}
104
; Shuffle form on <16 x float>: even lanes come from (A*B)-C and odd lanes from
; (A*B)+C. The runs without AVX-512 (FMA3_256 and FMA4 prefixes) expect two ymm
; fmaddsub instructions; the run with +avx512f (FMA3_512 prefix) expects a
; single zmm instruction.
105define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
106; FMA3_256-LABEL: mul_addsub_ps512:
107; FMA3_256:       # %bb.0: # %entry
108; FMA3_256-NEXT:    vfmaddsub213ps %ymm4, %ymm2, %ymm0
109; FMA3_256-NEXT:    vfmaddsub213ps %ymm5, %ymm3, %ymm1
110; FMA3_256-NEXT:    retq
111;
112; FMA3_512-LABEL: mul_addsub_ps512:
113; FMA3_512:       # %bb.0: # %entry
114; FMA3_512-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
115; FMA3_512-NEXT:    retq
116;
117; FMA4-LABEL: mul_addsub_ps512:
118; FMA4:       # %bb.0: # %entry
119; FMA4-NEXT:    vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
120; FMA4-NEXT:    vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
121; FMA4-NEXT:    retq
122entry:
123  %AB = fmul <16 x float> %A, %B
124  %Sub = fsub <16 x float> %AB, %C
125  %Add = fadd <16 x float> %AB, %C
126  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
127  ret <16 x float> %Addsub
128}
129
; Build-vector form on <4 x float>: the product C*D is split into scalars,
; even lanes get (product - B) and odd lanes get (product + B), and the result
; is reassembled with insertelement. Both the FMA3 and the FMA4 runs expect a
; single 128-bit fmaddsub instruction.
130define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
131; FMA3-LABEL: buildvector_mul_addsub_ps128:
132; FMA3:       # %bb.0: # %bb
133; FMA3-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
134; FMA3-NEXT:    retq
135;
136; FMA4-LABEL: buildvector_mul_addsub_ps128:
137; FMA4:       # %bb.0: # %bb
138; FMA4-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
139; FMA4-NEXT:    retq
140bb:
141  %A = fmul <4 x float> %C, %D
142  %A0 = extractelement <4 x float> %A, i32 0
143  %B0 = extractelement <4 x float> %B, i32 0
144  %sub0 = fsub float %A0, %B0
145  %A2 = extractelement <4 x float> %A, i32 2
146  %B2 = extractelement <4 x float> %B, i32 2
147  %sub2 = fsub float %A2, %B2
148  %A1 = extractelement <4 x float> %A, i32 1
149  %B1 = extractelement <4 x float> %B, i32 1
150  %add1 = fadd float %A1, %B1
151  %A3 = extractelement <4 x float> %A, i32 3
152  %B3 = extractelement <4 x float> %B, i32 3
153  %add3 = fadd float %A3, %B3
154  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
155  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
156  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
157  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
158  ret <4 x float> %vecinsert4
159}
160
; Build-vector form on <2 x double>: lane 0 is (C*D) - B and lane 1 is
; (C*D) + B, computed on extracted scalars and reassembled with insertelement.
; Both the FMA3 and the FMA4 runs expect a single 128-bit fmaddsub instruction.
161define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
162; FMA3-LABEL: buildvector_mul_addsub_pd128:
163; FMA3:       # %bb.0: # %bb
164; FMA3-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
165; FMA3-NEXT:    retq
166;
167; FMA4-LABEL: buildvector_mul_addsub_pd128:
168; FMA4:       # %bb.0: # %bb
169; FMA4-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
170; FMA4-NEXT:    retq
171bb:
172  %A = fmul <2 x double> %C, %D
173  %A0 = extractelement <2 x double> %A, i32 0
174  %B0 = extractelement <2 x double> %B, i32 0
175  %sub0 = fsub double %A0, %B0
176  %A1 = extractelement <2 x double> %A, i32 1
177  %B1 = extractelement <2 x double> %B, i32 1
178  %add1 = fadd double %A1, %B1
179  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
180  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
181  ret <2 x double> %vecinsert2
182}
183
; Build-vector form on <8 x float>: even lanes are (C*D) - B and odd lanes are
; (C*D) + B, computed on extracted scalars and reassembled with insertelement.
; Both the FMA3 and the FMA4 runs expect a single 256-bit (ymm) fmaddsub
; instruction.
184define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
185; FMA3-LABEL: buildvector_mul_addsub_ps256:
186; FMA3:       # %bb.0: # %bb
187; FMA3-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
188; FMA3-NEXT:    retq
189;
190; FMA4-LABEL: buildvector_mul_addsub_ps256:
191; FMA4:       # %bb.0: # %bb
192; FMA4-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
193; FMA4-NEXT:    retq
194bb:
195  %A = fmul <8 x float> %C, %D
196  %A0 = extractelement <8 x float> %A, i32 0
197  %B0 = extractelement <8 x float> %B, i32 0
198  %sub0 = fsub float %A0, %B0
199  %A2 = extractelement <8 x float> %A, i32 2
200  %B2 = extractelement <8 x float> %B, i32 2
201  %sub2 = fsub float %A2, %B2
202  %A4 = extractelement <8 x float> %A, i32 4
203  %B4 = extractelement <8 x float> %B, i32 4
204  %sub4 = fsub float %A4, %B4
205  %A6 = extractelement <8 x float> %A, i32 6
206  %B6 = extractelement <8 x float> %B, i32 6
207  %sub6 = fsub float %A6, %B6
208  %A1 = extractelement <8 x float> %A, i32 1
209  %B1 = extractelement <8 x float> %B, i32 1
210  %add1 = fadd float %A1, %B1
211  %A3 = extractelement <8 x float> %A, i32 3
212  %B3 = extractelement <8 x float> %B, i32 3
213  %add3 = fadd float %A3, %B3
214  %A5 = extractelement <8 x float> %A, i32 5
215  %B5 = extractelement <8 x float> %B, i32 5
216  %add5 = fadd float %A5, %B5
217  %A7 = extractelement <8 x float> %A, i32 7
218  %B7 = extractelement <8 x float> %B, i32 7
219  %add7 = fadd float %A7, %B7
220  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
221  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
222  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
223  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
224  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
225  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
226  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
227  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
228  ret <8 x float> %vecinsert8
229}
230
; Build-vector form on <4 x double>: even lanes are (C*D) - B and odd lanes are
; (C*D) + B, computed on extracted scalars and reassembled with insertelement.
; Both the FMA3 and the FMA4 runs expect a single 256-bit (ymm) fmaddsub
; instruction.
231define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
232; FMA3-LABEL: buildvector_mul_addsub_pd256:
233; FMA3:       # %bb.0: # %bb
234; FMA3-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
235; FMA3-NEXT:    retq
236;
237; FMA4-LABEL: buildvector_mul_addsub_pd256:
238; FMA4:       # %bb.0: # %bb
239; FMA4-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
240; FMA4-NEXT:    retq
241bb:
242  %A = fmul <4 x double> %C, %D
243  %A0 = extractelement <4 x double> %A, i32 0
244  %B0 = extractelement <4 x double> %B, i32 0
245  %sub0 = fsub double %A0, %B0
246  %A2 = extractelement <4 x double> %A, i32 2
247  %B2 = extractelement <4 x double> %B, i32 2
248  %sub2 = fsub double %A2, %B2
249  %A1 = extractelement <4 x double> %A, i32 1
250  %B1 = extractelement <4 x double> %B, i32 1
251  %add1 = fadd double %A1, %B1
252  %A3 = extractelement <4 x double> %A, i32 3
253  %B3 = extractelement <4 x double> %B, i32 3
254  %add3 = fadd double %A3, %B3
255  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
256  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
257  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
258  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
259  ret <4 x double> %vecinsert4
260}
261
; Build-vector form on <16 x float> with holes: even lanes are (C*D) - B and
; odd lanes are (C*D) + B, but lanes 5 and 12 are never inserted and therefore
; stay undef (%add5 and %sub12 are computed yet unused). The checks still
; expect fmaddsub: two ymm instructions for the runs without AVX-512
; (FMA3_256 and FMA4 prefixes) and one zmm instruction with +avx512f
; (FMA3_512 prefix).
262define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
263; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
264; FMA3_256:       # %bb.0: # %bb
265; FMA3_256-NEXT:    vfmaddsub213ps %ymm4, %ymm2, %ymm0
266; FMA3_256-NEXT:    vfmaddsub213ps %ymm5, %ymm3, %ymm1
267; FMA3_256-NEXT:    retq
268;
269; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
270; FMA3_512:       # %bb.0: # %bb
271; FMA3_512-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
272; FMA3_512-NEXT:    retq
273;
274; FMA4-LABEL: buildvector_mul_addsub_ps512:
275; FMA4:       # %bb.0: # %bb
276; FMA4-NEXT:    vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
277; FMA4-NEXT:    vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
278; FMA4-NEXT:    retq
279bb:
280  %A = fmul <16 x float> %C, %D
281  %A0 = extractelement <16 x float> %A, i32 0
282  %B0 = extractelement <16 x float> %B, i32 0
283  %sub0 = fsub float %A0, %B0
284  %A2 = extractelement <16 x float> %A, i32 2
285  %B2 = extractelement <16 x float> %B, i32 2
286  %sub2 = fsub float %A2, %B2
287  %A4 = extractelement <16 x float> %A, i32 4
288  %B4 = extractelement <16 x float> %B, i32 4
289  %sub4 = fsub float %A4, %B4
290  %A6 = extractelement <16 x float> %A, i32 6
291  %B6 = extractelement <16 x float> %B, i32 6
292  %sub6 = fsub float %A6, %B6
293  %A8 = extractelement <16 x float> %A, i32 8
294  %B8 = extractelement <16 x float> %B, i32 8
295  %sub8 = fsub float %A8, %B8
296  %A10 = extractelement <16 x float> %A, i32 10
297  %B10 = extractelement <16 x float> %B, i32 10
298  %sub10 = fsub float %A10, %B10
299  %A12 = extractelement <16 x float> %A, i32 12
300  %B12 = extractelement <16 x float> %B, i32 12
301  %sub12 = fsub float %A12, %B12
302  %A14 = extractelement <16 x float> %A, i32 14
303  %B14 = extractelement <16 x float> %B, i32 14
304  %sub14 = fsub float %A14, %B14
305  %A1 = extractelement <16 x float> %A, i32 1
306  %B1 = extractelement <16 x float> %B, i32 1
307  %add1 = fadd float %A1, %B1
308  %A3 = extractelement <16 x float> %A, i32 3
309  %B3 = extractelement <16 x float> %B, i32 3
310  %add3 = fadd float %A3, %B3
311  %A5 = extractelement <16 x float> %A, i32 5
312  %B5 = extractelement <16 x float> %B, i32 5
313  %add5 = fadd float %A5, %B5
314  %A7 = extractelement <16 x float> %A, i32 7
315  %B7 = extractelement <16 x float> %B, i32 7
316  %add7 = fadd float %A7, %B7
317  %A9 = extractelement <16 x float> %A, i32 9
318  %B9 = extractelement <16 x float> %B, i32 9
319  %add9 = fadd float %A9, %B9
320  %A11 = extractelement <16 x float> %A, i32 11
321  %B11 = extractelement <16 x float> %B, i32 11
322  %add11 = fadd float %A11, %B11
323  %A13 = extractelement <16 x float> %A, i32 13
324  %B13 = extractelement <16 x float> %B, i32 13
325  %add13 = fadd float %A13, %B13
326  %A15 = extractelement <16 x float> %A, i32 15
327  %B15 = extractelement <16 x float> %B, i32 15
328  %add15 = fadd float %A15, %B15
329  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
330  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
331  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
332  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
333  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
334  ; element 5 is undef
335  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
336  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
337  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
338  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
339  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
340  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
341  ; element 12 is undef
342  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
343  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
344  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
345  ret <16 x float> %vecinsert16
346}
347
; Build-vector form on <8 x double> with a hole: even lanes are (C*D) - B and
; odd lanes are (C*D) + B, but lane 5 is never inserted and therefore stays
; undef. The checks still expect fmaddsub: two ymm instructions for the runs
; without AVX-512 (FMA3_256 and FMA4 prefixes) and one zmm instruction with
; +avx512f (FMA3_512 prefix).
348define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
349; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
350; FMA3_256:       # %bb.0: # %bb
351; FMA3_256-NEXT:    vfmaddsub213pd %ymm4, %ymm2, %ymm0
352; FMA3_256-NEXT:    vfmaddsub213pd %ymm5, %ymm3, %ymm1
353; FMA3_256-NEXT:    retq
354;
355; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
356; FMA3_512:       # %bb.0: # %bb
357; FMA3_512-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
358; FMA3_512-NEXT:    retq
359;
360; FMA4-LABEL: buildvector_mul_addsub_pd512:
361; FMA4:       # %bb.0: # %bb
362; FMA4-NEXT:    vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
363; FMA4-NEXT:    vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
364; FMA4-NEXT:    retq
365bb:
366  %A = fmul <8 x double> %C, %D
367  %A0 = extractelement <8 x double> %A, i32 0
368  %B0 = extractelement <8 x double> %B, i32 0
369  %sub0 = fsub double %A0, %B0
370  %A2 = extractelement <8 x double> %A, i32 2
371  %B2 = extractelement <8 x double> %B, i32 2
372  %sub2 = fsub double %A2, %B2
373  %A4 = extractelement <8 x double> %A, i32 4
374  %B4 = extractelement <8 x double> %B, i32 4
375  %sub4 = fsub double %A4, %B4
376  %A6 = extractelement <8 x double> %A, i32 6
377  %B6 = extractelement <8 x double> %B, i32 6
378  %sub6 = fsub double %A6, %B6
379  %A1 = extractelement <8 x double> %A, i32 1
380  %B1 = extractelement <8 x double> %B, i32 1
381  %add1 = fadd double %A1, %B1
382  %A3 = extractelement <8 x double> %A, i32 3
383  %B3 = extractelement <8 x double> %B, i32 3
384  %add3 = fadd double %A3, %B3
385  %A7 = extractelement <8 x double> %A, i32 7
386  %B7 = extractelement <8 x double> %B, i32 7
387  %add7 = fadd double %A7, %B7
388  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
389  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
390  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
391  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
392  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
393  ; element 5 is undef
394  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
395  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
396  ret <8 x double> %vecinsert8
397}
398
; Build-vector form of the opposite alternation on <4 x float>: even lanes are
; (C*D) + B and odd lanes are (C*D) - B. Both the FMA3 and the FMA4 runs
; expect a single 128-bit fmsubadd instruction.
; Naming caveat: in this function the %subN values are produced by fadd and
; the %addN values by fsub.
399define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
400; FMA3-LABEL: buildvector_mul_subadd_ps128:
401; FMA3:       # %bb.0: # %bb
402; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
403; FMA3-NEXT:    retq
404;
405; FMA4-LABEL: buildvector_mul_subadd_ps128:
406; FMA4:       # %bb.0: # %bb
407; FMA4-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
408; FMA4-NEXT:    retq
409bb:
410  %A = fmul <4 x float> %C, %D
411  %A0 = extractelement <4 x float> %A, i32 0
412  %B0 = extractelement <4 x float> %B, i32 0
413  %sub0 = fadd float %A0, %B0
414  %A2 = extractelement <4 x float> %A, i32 2
415  %B2 = extractelement <4 x float> %B, i32 2
416  %sub2 = fadd float %A2, %B2
417  %A1 = extractelement <4 x float> %A, i32 1
418  %B1 = extractelement <4 x float> %B, i32 1
419  %add1 = fsub float %A1, %B1
420  %A3 = extractelement <4 x float> %A, i32 3
421  %B3 = extractelement <4 x float> %B, i32 3
422  %add3 = fsub float %A3, %B3
423  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
424  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
425  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
426  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
427  ret <4 x float> %vecinsert4
428}
429
; Build-vector form of the opposite alternation on <2 x double>: lane 0 is
; (C*D) + B and lane 1 is (C*D) - B. Both the FMA3 and the FMA4 runs expect a
; single 128-bit fmsubadd instruction.
; Naming caveat: in this function %sub0 is produced by fadd and %add1 by fsub.
430define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
431; FMA3-LABEL: buildvector_mul_subadd_pd128:
432; FMA3:       # %bb.0: # %bb
433; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
434; FMA3-NEXT:    retq
435;
436; FMA4-LABEL: buildvector_mul_subadd_pd128:
437; FMA4:       # %bb.0: # %bb
438; FMA4-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
439; FMA4-NEXT:    retq
440bb:
441  %A = fmul <2 x double> %C, %D
442  %A0 = extractelement <2 x double> %A, i32 0
443  %B0 = extractelement <2 x double> %B, i32 0
444  %sub0 = fadd double %A0, %B0
445  %A1 = extractelement <2 x double> %A, i32 1
446  %B1 = extractelement <2 x double> %B, i32 1
447  %add1 = fsub double %A1, %B1
448  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
449  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
450  ret <2 x double> %vecinsert2
451}
452
; Build-vector form of the opposite alternation on <8 x float>: even lanes are
; (C*D) + B and odd lanes are (C*D) - B. Both the FMA3 and the FMA4 runs
; expect a single 256-bit (ymm) fmsubadd instruction.
; Naming caveat: in this function the %subN values are produced by fadd and
; the %addN values by fsub.
453define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
454; FMA3-LABEL: buildvector_mul_subadd_ps256:
455; FMA3:       # %bb.0: # %bb
456; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
457; FMA3-NEXT:    retq
458;
459; FMA4-LABEL: buildvector_mul_subadd_ps256:
460; FMA4:       # %bb.0: # %bb
461; FMA4-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
462; FMA4-NEXT:    retq
463bb:
464  %A = fmul <8 x float> %C, %D
465  %A0 = extractelement <8 x float> %A, i32 0
466  %B0 = extractelement <8 x float> %B, i32 0
467  %sub0 = fadd float %A0, %B0
468  %A2 = extractelement <8 x float> %A, i32 2
469  %B2 = extractelement <8 x float> %B, i32 2
470  %sub2 = fadd float %A2, %B2
471  %A4 = extractelement <8 x float> %A, i32 4
472  %B4 = extractelement <8 x float> %B, i32 4
473  %sub4 = fadd float %A4, %B4
474  %A6 = extractelement <8 x float> %A, i32 6
475  %B6 = extractelement <8 x float> %B, i32 6
476  %sub6 = fadd float %A6, %B6
477  %A1 = extractelement <8 x float> %A, i32 1
478  %B1 = extractelement <8 x float> %B, i32 1
479  %add1 = fsub float %A1, %B1
480  %A3 = extractelement <8 x float> %A, i32 3
481  %B3 = extractelement <8 x float> %B, i32 3
482  %add3 = fsub float %A3, %B3
483  %A5 = extractelement <8 x float> %A, i32 5
484  %B5 = extractelement <8 x float> %B, i32 5
485  %add5 = fsub float %A5, %B5
486  %A7 = extractelement <8 x float> %A, i32 7
487  %B7 = extractelement <8 x float> %B, i32 7
488  %add7 = fsub float %A7, %B7
489  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
490  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
491  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
492  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
493  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
494  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
495  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
496  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
497  ret <8 x float> %vecinsert8
498}
499
; Build-vector form of the opposite alternation on <4 x double>: even lanes
; are (C*D) + B and odd lanes are (C*D) - B. Both the FMA3 and the FMA4 runs
; expect a single 256-bit (ymm) fmsubadd instruction.
; Naming caveat: in this function the %subN values are produced by fadd and
; the %addN values by fsub.
500define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
501; FMA3-LABEL: buildvector_mul_subadd_pd256:
502; FMA3:       # %bb.0: # %bb
503; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
504; FMA3-NEXT:    retq
505;
506; FMA4-LABEL: buildvector_mul_subadd_pd256:
507; FMA4:       # %bb.0: # %bb
508; FMA4-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
509; FMA4-NEXT:    retq
510bb:
511  %A = fmul <4 x double> %C, %D
512  %A0 = extractelement <4 x double> %A, i32 0
513  %B0 = extractelement <4 x double> %B, i32 0
514  %sub0 = fadd double %A0, %B0
515  %A2 = extractelement <4 x double> %A, i32 2
516  %B2 = extractelement <4 x double> %B, i32 2
517  %sub2 = fadd double %A2, %B2
518  %A1 = extractelement <4 x double> %A, i32 1
519  %B1 = extractelement <4 x double> %B, i32 1
520  %add1 = fsub double %A1, %B1
521  %A3 = extractelement <4 x double> %A, i32 3
522  %B3 = extractelement <4 x double> %B, i32 3
523  %add3 = fsub double %A3, %B3
524  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
525  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
526  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
527  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
528  ret <4 x double> %vecinsert4
529}
530
; Build-vector form of the opposite alternation on <16 x float> with holes:
; even lanes are (C*D) + B and odd lanes are (C*D) - B, but lanes 5 and 12 are
; never inserted and therefore stay undef (%add5 and %sub12 are computed yet
; unused). The checks still expect fmsubadd: two ymm instructions for the runs
; without AVX-512 (FMA3_256 and FMA4 prefixes) and one zmm instruction with
; +avx512f (FMA3_512 prefix).
; Naming caveat: in this function the %subN values are produced by fadd and
; the %addN values by fsub.
531define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
532; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
533; FMA3_256:       # %bb.0: # %bb
534; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
535; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
536; FMA3_256-NEXT:    retq
537;
538; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
539; FMA3_512:       # %bb.0: # %bb
540; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
541; FMA3_512-NEXT:    retq
542;
543; FMA4-LABEL: buildvector_mul_subadd_ps512:
544; FMA4:       # %bb.0: # %bb
545; FMA4-NEXT:    vfmsubaddps %ymm4, %ymm2, %ymm0, %ymm0
546; FMA4-NEXT:    vfmsubaddps %ymm5, %ymm3, %ymm1, %ymm1
547; FMA4-NEXT:    retq
548bb:
549  %A = fmul <16 x float> %C, %D
550  %A0 = extractelement <16 x float> %A, i32 0
551  %B0 = extractelement <16 x float> %B, i32 0
552  %sub0 = fadd float %A0, %B0
553  %A2 = extractelement <16 x float> %A, i32 2
554  %B2 = extractelement <16 x float> %B, i32 2
555  %sub2 = fadd float %A2, %B2
556  %A4 = extractelement <16 x float> %A, i32 4
557  %B4 = extractelement <16 x float> %B, i32 4
558  %sub4 = fadd float %A4, %B4
559  %A6 = extractelement <16 x float> %A, i32 6
560  %B6 = extractelement <16 x float> %B, i32 6
561  %sub6 = fadd float %A6, %B6
562  %A8 = extractelement <16 x float> %A, i32 8
563  %B8 = extractelement <16 x float> %B, i32 8
564  %sub8 = fadd float %A8, %B8
565  %A10 = extractelement <16 x float> %A, i32 10
566  %B10 = extractelement <16 x float> %B, i32 10
567  %sub10 = fadd float %A10, %B10
568  %A12 = extractelement <16 x float> %A, i32 12
569  %B12 = extractelement <16 x float> %B, i32 12
570  %sub12 = fadd float %A12, %B12
571  %A14 = extractelement <16 x float> %A, i32 14
572  %B14 = extractelement <16 x float> %B, i32 14
573  %sub14 = fadd float %A14, %B14
574  %A1 = extractelement <16 x float> %A, i32 1
575  %B1 = extractelement <16 x float> %B, i32 1
576  %add1 = fsub float %A1, %B1
577  %A3 = extractelement <16 x float> %A, i32 3
578  %B3 = extractelement <16 x float> %B, i32 3
579  %add3 = fsub float %A3, %B3
580  %A5 = extractelement <16 x float> %A, i32 5
581  %B5 = extractelement <16 x float> %B, i32 5
582  %add5 = fsub float %A5, %B5
583  %A7 = extractelement <16 x float> %A, i32 7
584  %B7 = extractelement <16 x float> %B, i32 7
585  %add7 = fsub float %A7, %B7
586  %A9 = extractelement <16 x float> %A, i32 9
587  %B9 = extractelement <16 x float> %B, i32 9
588  %add9 = fsub float %A9, %B9
589  %A11 = extractelement <16 x float> %A, i32 11
590  %B11 = extractelement <16 x float> %B, i32 11
591  %add11 = fsub float %A11, %B11
592  %A13 = extractelement <16 x float> %A, i32 13
593  %B13 = extractelement <16 x float> %B, i32 13
594  %add13 = fsub float %A13, %B13
595  %A15 = extractelement <16 x float> %A, i32 15
596  %B15 = extractelement <16 x float> %B, i32 15
597  %add15 = fsub float %A15, %B15
598  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
599  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
600  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
601  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
602  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
603  ; element 5 is undef
604  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
605  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
606  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
607  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
608  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
609  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
610  ; element 12 is undef
611  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
612  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
613  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
614  ret <16 x float> %vecinsert16
615}
616
; Build-vector form of the opposite alternation on <8 x double> with a hole:
; even lanes are (C*D) + B and odd lanes are (C*D) - B, but lane 5 is never
; inserted and therefore stays undef. The checks still expect fmsubadd: two
; ymm instructions for the runs without AVX-512 (FMA3_256 and FMA4 prefixes)
; and one zmm instruction with +avx512f (FMA3_512 prefix).
; Naming caveat: in this function the %subN values are produced by fadd and
; the %addN values by fsub.
617define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
618; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
619; FMA3_256:       # %bb.0: # %bb
620; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
621; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
622; FMA3_256-NEXT:    retq
623;
624; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
625; FMA3_512:       # %bb.0: # %bb
626; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
627; FMA3_512-NEXT:    retq
628;
629; FMA4-LABEL: buildvector_mul_subadd_pd512:
630; FMA4:       # %bb.0: # %bb
631; FMA4-NEXT:    vfmsubaddpd %ymm4, %ymm2, %ymm0, %ymm0
632; FMA4-NEXT:    vfmsubaddpd %ymm5, %ymm3, %ymm1, %ymm1
633; FMA4-NEXT:    retq
634bb:
635  %A = fmul <8 x double> %C, %D
636  %A0 = extractelement <8 x double> %A, i32 0
637  %B0 = extractelement <8 x double> %B, i32 0
638  %sub0 = fadd double %A0, %B0
639  %A2 = extractelement <8 x double> %A, i32 2
640  %B2 = extractelement <8 x double> %B, i32 2
641  %sub2 = fadd double %A2, %B2
642  %A4 = extractelement <8 x double> %A, i32 4
643  %B4 = extractelement <8 x double> %B, i32 4
644  %sub4 = fadd double %A4, %B4
645  %A6 = extractelement <8 x double> %A, i32 6
646  %B6 = extractelement <8 x double> %B, i32 6
647  %sub6 = fadd double %A6, %B6
648  %A1 = extractelement <8 x double> %A, i32 1
649  %B1 = extractelement <8 x double> %B, i32 1
650  %add1 = fsub double %A1, %B1
651  %A3 = extractelement <8 x double> %A, i32 3
652  %B3 = extractelement <8 x double> %B, i32 3
653  %add3 = fsub double %A3, %B3
654  %A7 = extractelement <8 x double> %A, i32 7
655  %B7 = extractelement <8 x double> %B, i32 7
656  %add7 = fsub double %A7, %B7
657  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
658  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
659  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
660  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
661  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
662  ; element 5 is undef
663  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
664  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
665  ret <8 x double> %vecinsert8
666}
667
; Attribute group referenced as #0 by every function in this file.
; NOTE(review): "unsafe-fp-math"="true" is presumably what permits contracting
; the fmul with the following fadd/fsub into a fused instruction - confirm.
668attributes #0 = { nounwind "unsafe-fp-math"="true" }
669