• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 | FileCheck %s --check-prefix=FMA4
3
4attributes #0 = { nounwind }
5
6declare <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one scalar load into xmm0, then vfmaddss with xmm0 used twice
; and the remaining operand taken from memory.
7define <4 x float> @test_x86_fmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
8; FMA4-LABEL: test_x86_fmadd_baa_ss:
9; FMA4:       # %bb.0:
10; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
11; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
12; FMA4-NEXT:    retq
13  %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
14  ret <4 x float> %res
15}
16
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one scalar load into xmm0, then vfmaddss with xmm0 used twice
; and the remaining operand taken from memory.
17define <4 x float> @test_x86_fmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
18; FMA4-LABEL: test_x86_fmadd_aba_ss:
19; FMA4:       # %bb.0:
20; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
21; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
22; FMA4-NEXT:    retq
23  %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
24  ret <4 x float> %res
25}
26
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one scalar load into xmm0, then vfmaddss multiplying xmm0 by itself
; and taking the third operand from memory.
27define <4 x float> @test_x86_fmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 {
28; FMA4-LABEL: test_x86_fmadd_bba_ss:
29; FMA4:       # %bb.0:
30; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
31; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
32; FMA4-NEXT:    retq
33  %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
34  ret <4 x float> %res
35}
36
37declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into xmm0, then vfmaddps with xmm0 used twice
; and the remaining operand taken from memory.
38define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
39; FMA4-LABEL: test_x86_fmadd_baa_ps:
40; FMA4:       # %bb.0:
41; FMA4-NEXT:    vmovaps (%rcx), %xmm0
42; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
43; FMA4-NEXT:    retq
44  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
45  ret <4 x float> %res
46}
47
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into xmm0, then vfmaddps with xmm0 used twice
; and the remaining operand taken from memory.
48define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
49; FMA4-LABEL: test_x86_fmadd_aba_ps:
50; FMA4:       # %bb.0:
51; FMA4-NEXT:    vmovaps (%rcx), %xmm0
52; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
53; FMA4-NEXT:    retq
54  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
55  ret <4 x float> %res
56}
57
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovaps load from (%rdx) into xmm0, then vfmaddps multiplying xmm0 by itself
; and taking the third operand from memory.
58define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
59; FMA4-LABEL: test_x86_fmadd_bba_ps:
60; FMA4:       # %bb.0:
61; FMA4-NEXT:    vmovaps (%rdx), %xmm0
62; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
63; FMA4-NEXT:    retq
64  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
65  ret <4 x float> %res
66}
67
68declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into ymm0, then vfmaddps with ymm0 used twice
; and the remaining operand taken from memory.
69define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
70; FMA4-LABEL: test_x86_fmadd_baa_ps_y:
71; FMA4:       # %bb.0:
72; FMA4-NEXT:    vmovaps (%rcx), %ymm0
73; FMA4-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
74; FMA4-NEXT:    retq
75  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
76  ret <8 x float> %res
77}
78
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into ymm0, then vfmaddps with ymm0 used twice
; and the remaining operand taken from memory.
79define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
80; FMA4-LABEL: test_x86_fmadd_aba_ps_y:
81; FMA4:       # %bb.0:
82; FMA4-NEXT:    vmovaps (%rcx), %ymm0
83; FMA4-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
84; FMA4-NEXT:    retq
85  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
86  ret <8 x float> %res
87}
88
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovaps load from (%rdx) into ymm0, then vfmaddps multiplying ymm0 by itself
; and taking the third operand from memory.
89define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
90; FMA4-LABEL: test_x86_fmadd_bba_ps_y:
91; FMA4:       # %bb.0:
92; FMA4-NEXT:    vmovaps (%rdx), %ymm0
93; FMA4-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm0) + mem
94; FMA4-NEXT:    retq
95  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
96  ret <8 x float> %res
97}
98
99declare <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one scalar load into xmm0, then vfmaddsd with xmm0 used twice
; and the remaining operand taken from memory.
100define <2 x double> @test_x86_fmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0 {
101; FMA4-LABEL: test_x86_fmadd_baa_sd:
102; FMA4:       # %bb.0:
103; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
104; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
105; FMA4-NEXT:    retq
106  %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
107  ret <2 x double> %res
108}
109
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one scalar load into xmm0, then vfmaddsd with xmm0 used twice
; and the remaining operand taken from memory.
110define <2 x double> @test_x86_fmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0 {
111; FMA4-LABEL: test_x86_fmadd_aba_sd:
112; FMA4:       # %bb.0:
113; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
114; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
115; FMA4-NEXT:    retq
116  %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
117  ret <2 x double> %res
118}
119
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one scalar load into xmm0, then vfmaddsd multiplying xmm0 by itself
; and taking the third operand from memory.
120define <2 x double> @test_x86_fmadd_bba_sd(<2 x double> %a, <2 x double> %b) #0 {
121; FMA4-LABEL: test_x86_fmadd_bba_sd:
122; FMA4:       # %bb.0:
123; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
124; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
125; FMA4-NEXT:    retq
126  %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
127  ret <2 x double> %res
128}
129
130declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into xmm0, then vfmaddpd with xmm0 used twice
; and the remaining operand taken from memory.
131define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
132; FMA4-LABEL: test_x86_fmadd_baa_pd:
133; FMA4:       # %bb.0:
134; FMA4-NEXT:    vmovapd (%rcx), %xmm0
135; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
136; FMA4-NEXT:    retq
137  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
138  ret <2 x double> %res
139}
140
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into xmm0, then vfmaddpd with xmm0 used twice
; and the remaining operand taken from memory.
141define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
142; FMA4-LABEL: test_x86_fmadd_aba_pd:
143; FMA4:       # %bb.0:
144; FMA4-NEXT:    vmovapd (%rcx), %xmm0
145; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
146; FMA4-NEXT:    retq
147  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
148  ret <2 x double> %res
149}
150
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovapd load from (%rdx) into xmm0, then vfmaddpd multiplying xmm0 by itself
; and taking the third operand from memory.
151define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
152; FMA4-LABEL: test_x86_fmadd_bba_pd:
153; FMA4:       # %bb.0:
154; FMA4-NEXT:    vmovapd (%rdx), %xmm0
155; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
156; FMA4-NEXT:    retq
157  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
158  ret <2 x double> %res
159}
160
161declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into ymm0, then vfmaddpd with ymm0 used twice
; and the remaining operand taken from memory.
162define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
163; FMA4-LABEL: test_x86_fmadd_baa_pd_y:
164; FMA4:       # %bb.0:
165; FMA4-NEXT:    vmovapd (%rcx), %ymm0
166; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
167; FMA4-NEXT:    retq
168  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
169  ret <4 x double> %res
170}
171
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into ymm0, then vfmaddpd with ymm0 used twice
; and the remaining operand taken from memory.
172define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
173; FMA4-LABEL: test_x86_fmadd_aba_pd_y:
174; FMA4:       # %bb.0:
175; FMA4-NEXT:    vmovapd (%rcx), %ymm0
176; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
177; FMA4-NEXT:    retq
178  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
179  ret <4 x double> %res
180}
181
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovapd load from (%rdx) into ymm0, then vfmaddpd multiplying ymm0 by itself
; and taking the third operand from memory.
182define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
183; FMA4-LABEL: test_x86_fmadd_bba_pd_y:
184; FMA4:       # %bb.0:
185; FMA4-NEXT:    vmovapd (%rdx), %ymm0
186; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm0) + mem
187; FMA4-NEXT:    retq
188  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
189  ret <4 x double> %res
190}
191
192declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into xmm0, then vfnmaddps (negated product plus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
193define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
194; FMA4-LABEL: test_x86_fnmadd_baa_ps:
195; FMA4:       # %bb.0:
196; FMA4-NEXT:    vmovaps (%rcx), %xmm0
197; FMA4-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
198; FMA4-NEXT:    retq
199  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
200  ret <4 x float> %res
201}
202
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into xmm0, then vfnmaddps (negated product plus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
203define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
204; FMA4-LABEL: test_x86_fnmadd_aba_ps:
205; FMA4:       # %bb.0:
206; FMA4-NEXT:    vmovaps (%rcx), %xmm0
207; FMA4-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
208; FMA4-NEXT:    retq
209  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
210  ret <4 x float> %res
211}
212
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovaps load from (%rdx) into xmm0, then vfnmaddps (negated product plus
; third operand) multiplying xmm0 by itself and taking the third operand from memory.
213define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
214; FMA4-LABEL: test_x86_fnmadd_bba_ps:
215; FMA4:       # %bb.0:
216; FMA4-NEXT:    vmovaps (%rdx), %xmm0
217; FMA4-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
218; FMA4-NEXT:    retq
219  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
220  ret <4 x float> %res
221}
222
223declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into ymm0, then vfnmaddps (negated product plus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
224define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
225; FMA4-LABEL: test_x86_fnmadd_baa_ps_y:
226; FMA4:       # %bb.0:
227; FMA4-NEXT:    vmovaps (%rcx), %ymm0
228; FMA4-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
229; FMA4-NEXT:    retq
230  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
231  ret <8 x float> %res
232}
233
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into ymm0, then vfnmaddps (negated product plus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
234define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
235; FMA4-LABEL: test_x86_fnmadd_aba_ps_y:
236; FMA4:       # %bb.0:
237; FMA4-NEXT:    vmovaps (%rcx), %ymm0
238; FMA4-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
239; FMA4-NEXT:    retq
240  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
241  ret <8 x float> %res
242}
243
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovaps load from (%rdx) into ymm0, then vfnmaddps (negated product plus
; third operand) multiplying ymm0 by itself and taking the third operand from memory.
244define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
245; FMA4-LABEL: test_x86_fnmadd_bba_ps_y:
246; FMA4:       # %bb.0:
247; FMA4-NEXT:    vmovaps (%rdx), %ymm0
248; FMA4-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm0 * ymm0) + mem
249; FMA4-NEXT:    retq
250  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
251  ret <8 x float> %res
252}
253
254declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into xmm0, then vfnmaddpd (negated product plus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
255define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
256; FMA4-LABEL: test_x86_fnmadd_baa_pd:
257; FMA4:       # %bb.0:
258; FMA4-NEXT:    vmovapd (%rcx), %xmm0
259; FMA4-NEXT:    vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
260; FMA4-NEXT:    retq
261  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
262  ret <2 x double> %res
263}
264
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into xmm0, then vfnmaddpd (negated product plus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
265define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
266; FMA4-LABEL: test_x86_fnmadd_aba_pd:
267; FMA4:       # %bb.0:
268; FMA4-NEXT:    vmovapd (%rcx), %xmm0
269; FMA4-NEXT:    vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
270; FMA4-NEXT:    retq
271  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
272  ret <2 x double> %res
273}
274
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovapd load from (%rdx) into xmm0, then vfnmaddpd (negated product plus
; third operand) multiplying xmm0 by itself and taking the third operand from memory.
275define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
276; FMA4-LABEL: test_x86_fnmadd_bba_pd:
277; FMA4:       # %bb.0:
278; FMA4-NEXT:    vmovapd (%rdx), %xmm0
279; FMA4-NEXT:    vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
280; FMA4-NEXT:    retq
281  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
282  ret <2 x double> %res
283}
284
285declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into ymm0, then vfnmaddpd (negated product plus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
286define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
287; FMA4-LABEL: test_x86_fnmadd_baa_pd_y:
288; FMA4:       # %bb.0:
289; FMA4-NEXT:    vmovapd (%rcx), %ymm0
290; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
291; FMA4-NEXT:    retq
292  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
293  ret <4 x double> %res
294}
295
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into ymm0, then vfnmaddpd (negated product plus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
296define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
297; FMA4-LABEL: test_x86_fnmadd_aba_pd_y:
298; FMA4:       # %bb.0:
299; FMA4-NEXT:    vmovapd (%rcx), %ymm0
300; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
301; FMA4-NEXT:    retq
302  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
303  ret <4 x double> %res
304}
305
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovapd load from (%rdx) into ymm0, then vfnmaddpd (negated product plus
; third operand) multiplying ymm0 by itself and taking the third operand from memory.
306define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
307; FMA4-LABEL: test_x86_fnmadd_bba_pd_y:
308; FMA4:       # %bb.0:
309; FMA4-NEXT:    vmovapd (%rdx), %ymm0
310; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm0) + mem
311; FMA4-NEXT:    retq
312  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
313  ret <4 x double> %res
314}
315
316declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into xmm0, then vfmsubps (product minus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
317define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
318; FMA4-LABEL: test_x86_fmsub_baa_ps:
319; FMA4:       # %bb.0:
320; FMA4-NEXT:    vmovaps (%rcx), %xmm0
321; FMA4-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
322; FMA4-NEXT:    retq
323  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
324  ret <4 x float> %res
325}
326
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into xmm0, then vfmsubps (product minus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
327define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
328; FMA4-LABEL: test_x86_fmsub_aba_ps:
329; FMA4:       # %bb.0:
330; FMA4-NEXT:    vmovaps (%rcx), %xmm0
331; FMA4-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
332; FMA4-NEXT:    retq
333  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
334  ret <4 x float> %res
335}
336
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovaps load from (%rdx) into xmm0, then vfmsubps (product minus
; third operand) multiplying xmm0 by itself and taking the third operand from memory.
337define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
338; FMA4-LABEL: test_x86_fmsub_bba_ps:
339; FMA4:       # %bb.0:
340; FMA4-NEXT:    vmovaps (%rdx), %xmm0
341; FMA4-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
342; FMA4-NEXT:    retq
343  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
344  ret <4 x float> %res
345}
346
347declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into ymm0, then vfmsubps (product minus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
348define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
349; FMA4-LABEL: test_x86_fmsub_baa_ps_y:
350; FMA4:       # %bb.0:
351; FMA4-NEXT:    vmovaps (%rcx), %ymm0
352; FMA4-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
353; FMA4-NEXT:    retq
354  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
355  ret <8 x float> %res
356}
357
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into ymm0, then vfmsubps (product minus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
358define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
359; FMA4-LABEL: test_x86_fmsub_aba_ps_y:
360; FMA4:       # %bb.0:
361; FMA4-NEXT:    vmovaps (%rcx), %ymm0
362; FMA4-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
363; FMA4-NEXT:    retq
364  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
365  ret <8 x float> %res
366}
367
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovaps load from (%rdx) into ymm0, then vfmsubps (product minus
; third operand) multiplying ymm0 by itself and taking the third operand from memory.
368define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
369; FMA4-LABEL: test_x86_fmsub_bba_ps_y:
370; FMA4:       # %bb.0:
371; FMA4-NEXT:    vmovaps (%rdx), %ymm0
372; FMA4-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm0) - mem
373; FMA4-NEXT:    retq
374  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
375  ret <8 x float> %res
376}
377
378declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into xmm0, then vfmsubpd (product minus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
379define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
380; FMA4-LABEL: test_x86_fmsub_baa_pd:
381; FMA4:       # %bb.0:
382; FMA4-NEXT:    vmovapd (%rcx), %xmm0
383; FMA4-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
384; FMA4-NEXT:    retq
385  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
386  ret <2 x double> %res
387}
388
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into xmm0, then vfmsubpd (product minus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
389define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
390; FMA4-LABEL: test_x86_fmsub_aba_pd:
391; FMA4:       # %bb.0:
392; FMA4-NEXT:    vmovapd (%rcx), %xmm0
393; FMA4-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
394; FMA4-NEXT:    retq
395  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
396  ret <2 x double> %res
397}
398
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovapd load from (%rdx) into xmm0, then vfmsubpd (product minus
; third operand) multiplying xmm0 by itself and taking the third operand from memory.
399define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
400; FMA4-LABEL: test_x86_fmsub_bba_pd:
401; FMA4:       # %bb.0:
402; FMA4-NEXT:    vmovapd (%rdx), %xmm0
403; FMA4-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
404; FMA4-NEXT:    retq
405  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
406  ret <2 x double> %res
407}
408
409declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into ymm0, then vfmsubpd (product minus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
410define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
411; FMA4-LABEL: test_x86_fmsub_baa_pd_y:
412; FMA4:       # %bb.0:
413; FMA4-NEXT:    vmovapd (%rcx), %ymm0
414; FMA4-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
415; FMA4-NEXT:    retq
416  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
417  ret <4 x double> %res
418}
419
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into ymm0, then vfmsubpd (product minus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
420define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
421; FMA4-LABEL: test_x86_fmsub_aba_pd_y:
422; FMA4:       # %bb.0:
423; FMA4-NEXT:    vmovapd (%rcx), %ymm0
424; FMA4-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
425; FMA4-NEXT:    retq
426  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
427  ret <4 x double> %res
428}
429
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovapd load from (%rdx) into ymm0, then vfmsubpd (product minus
; third operand) multiplying ymm0 by itself and taking the third operand from memory.
430define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
431; FMA4-LABEL: test_x86_fmsub_bba_pd_y:
432; FMA4:       # %bb.0:
433; FMA4-NEXT:    vmovapd (%rdx), %ymm0
434; FMA4-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm0) - mem
435; FMA4-NEXT:    retq
436  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
437  ret <4 x double> %res
438}
439
440declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into xmm0, then vfnmsubps (negated product minus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
441define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
442; FMA4-LABEL: test_x86_fnmsub_baa_ps:
443; FMA4:       # %bb.0:
444; FMA4-NEXT:    vmovaps (%rcx), %xmm0
445; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
446; FMA4-NEXT:    retq
447  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
448  ret <4 x float> %res
449}
450
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into xmm0, then vfnmsubps (negated product minus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
451define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
452; FMA4-LABEL: test_x86_fnmsub_aba_ps:
453; FMA4:       # %bb.0:
454; FMA4-NEXT:    vmovaps (%rcx), %xmm0
455; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
456; FMA4-NEXT:    retq
457  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
458  ret <4 x float> %res
459}
460
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovaps load from (%rdx) into xmm0, then vfnmsubps (negated product minus
; third operand) multiplying xmm0 by itself and taking the third operand from memory.
461define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
462; FMA4-LABEL: test_x86_fnmsub_bba_ps:
463; FMA4:       # %bb.0:
464; FMA4-NEXT:    vmovaps (%rdx), %xmm0
465; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
466; FMA4-NEXT:    retq
467  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
468  ret <4 x float> %res
469}
470
471declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into ymm0, then vfnmsubps (negated product minus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
472define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
473; FMA4-LABEL: test_x86_fnmsub_baa_ps_y:
474; FMA4:       # %bb.0:
475; FMA4-NEXT:    vmovaps (%rcx), %ymm0
476; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
477; FMA4-NEXT:    retq
478  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
479  ret <8 x float> %res
480}
481
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovaps load from (%rcx) into ymm0, then vfnmsubps (negated product minus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
482define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
483; FMA4-LABEL: test_x86_fnmsub_aba_ps_y:
484; FMA4:       # %bb.0:
485; FMA4-NEXT:    vmovaps (%rcx), %ymm0
486; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
487; FMA4-NEXT:    retq
488  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
489  ret <8 x float> %res
490}
491
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovaps load from (%rdx) into ymm0, then vfnmsubps (negated product minus
; third operand) multiplying ymm0 by itself and taking the third operand from memory.
492define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
493; FMA4-LABEL: test_x86_fnmsub_bba_ps_y:
494; FMA4:       # %bb.0:
495; FMA4-NEXT:    vmovaps (%rdx), %ymm0
496; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm0) - mem
497; FMA4-NEXT:    retq
498  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
499  ret <8 x float> %res
500}
501
502declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into xmm0, then vfnmsubpd (negated product minus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
503define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
504; FMA4-LABEL: test_x86_fnmsub_baa_pd:
505; FMA4:       # %bb.0:
506; FMA4-NEXT:    vmovapd (%rcx), %xmm0
507; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
508; FMA4-NEXT:    retq
509  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
510  ret <2 x double> %res
511}
512
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into xmm0, then vfnmsubpd (negated product minus
; third operand) with xmm0 used twice and the remaining operand taken from memory.
513define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
514; FMA4-LABEL: test_x86_fnmsub_aba_pd:
515; FMA4:       # %bb.0:
516; FMA4-NEXT:    vmovapd (%rcx), %xmm0
517; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
518; FMA4-NEXT:    retq
519  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
520  ret <2 x double> %res
521}
522
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovapd load from (%rdx) into xmm0, then vfnmsubpd (negated product minus
; third operand) multiplying xmm0 by itself and taking the third operand from memory.
523define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
524; FMA4-LABEL: test_x86_fnmsub_bba_pd:
525; FMA4:       # %bb.0:
526; FMA4-NEXT:    vmovapd (%rdx), %xmm0
527; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
528; FMA4-NEXT:    retq
529  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
530  ret <2 x double> %res
531}
532
533declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
; Intrinsic operands: (%b, %a, %a) - %a is the second multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into ymm0, then vfnmsubpd (negated product minus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
534define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
535; FMA4-LABEL: test_x86_fnmsub_baa_pd_y:
536; FMA4:       # %bb.0:
537; FMA4-NEXT:    vmovapd (%rcx), %ymm0
538; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
539; FMA4-NEXT:    retq
540  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
541  ret <4 x double> %res
542}
543
; Intrinsic operands: (%a, %b, %a) - %a is the first multiplicand and the third operand.
; Expected code: one vmovapd load from (%rcx) into ymm0, then vfnmsubpd (negated product minus
; third operand) with ymm0 used twice and the remaining operand taken from memory.
544define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
545; FMA4-LABEL: test_x86_fnmsub_aba_pd_y:
546; FMA4:       # %bb.0:
547; FMA4-NEXT:    vmovapd (%rcx), %ymm0
548; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
549; FMA4-NEXT:    retq
550  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
551  ret <4 x double> %res
552}
553
; Intrinsic operands: (%b, %b, %a) - %b is both multiplicands, %a is the third operand.
; Expected code: one vmovapd load from (%rdx) into ymm0, then vfnmsubpd (negated product minus
; third operand) multiplying ymm0 by itself and taking the third operand from memory.
554define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
555; FMA4-LABEL: test_x86_fnmsub_bba_pd_y:
556; FMA4:       # %bb.0:
557; FMA4-NEXT:    vmovapd (%rdx), %ymm0
558; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm0) - mem
559; FMA4-NEXT:    retq
560  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
561  ret <4 x double> %res
562}
563
564