; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=FMA4

; This test checks the fusing of MUL + ADDSUB to FMADDSUB.

define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B,  <2 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
entry:
  ; Even lane from the fsub, odd lane from the fadd: the ADDSUB pattern.
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %Addsub
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
entry:
  ; Even lanes from the fsub, odd lanes from the fadd: the ADDSUB pattern.
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %Addsub
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
entry:
  ; Even lanes from the fsub, odd lanes from the fadd: the ADDSUB pattern.
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %Addsub
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
entry:
  ; Even lanes from the fsub, odd lanes from the fadd: the ADDSUB pattern.
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %Addsub
}

define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
entry:
  ; 512-bit ADDSUB pattern; without AVX-512 it is split into two 256-bit ops.
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %Addsub
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
entry:
  ; 512-bit ADDSUB pattern; without AVX-512 it is split into two 256-bit ops.
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %Addsub
}

define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
bb:
  ; Scalarized ADDSUB: fsub into even lanes, fadd into odd lanes, rebuilt
  ; with insertelement instead of shufflevector.
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
bb:
  ; Scalarized ADDSUB: fsub into lane 0, fadd into lane 1.
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
bb:
  ; Scalarized ADDSUB: fsub into even lanes, fadd into odd lanes.
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
bb:
  ; Scalarized ADDSUB: fsub into even lanes, fadd into odd lanes.
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
bb:
  ; Scalarized ADDSUB with lanes 5 and 12 left undef: the fold must still
  ; fire when some lanes of the build vector are missing.
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fsub float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fsub float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fsub float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fsub float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fadd float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fadd float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fadd float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fadd float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
bb:
  ; Scalarized ADDSUB with lane 5 left undef: the fold must still fire
  ; when a lane of the build vector is missing.
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fsub double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fsub double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fadd double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
bb:
  ; Scalarized SUBADD: fadd into even lanes, fsub into odd lanes.
  ; NOTE: the %subN/%addN value names are swapped relative to their opcodes.
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
bb:
  ; Scalarized SUBADD: fadd into lane 0, fsub into lane 1.
  ; NOTE: the %subN/%addN value names are swapped relative to their opcodes.
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
bb:
  ; Scalarized SUBADD: fadd into even lanes, fsub into odd lanes.
  ; NOTE: the %subN/%addN value names are swapped relative to their opcodes.
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
bb:
  ; Scalarized SUBADD: fadd into even lanes, fsub into odd lanes.
  ; NOTE: the %subN/%addN value names are swapped relative to their opcodes.
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
bb:
  ; Scalarized SUBADD with lanes 5 and 12 left undef: the fold must still
  ; fire when some lanes of the build vector are missing.
  ; NOTE: the %subN/%addN value names are swapped relative to their opcodes.
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fadd float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fadd float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fadd float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fadd float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fsub float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fsub float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fsub float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fsub float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
bb:
  ; Scalarized SUBADD with lane 5 left undef: the fold must still fire
  ; when a lane of the build vector is missing.
  ; NOTE: the %subN/%addN value names are swapped relative to their opcodes.
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fadd double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fadd double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fsub double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }