; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL
; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN

; VFMADD
; Scalar f32 fused multiply-add: fma(%a0[0], %a1[0], %a2[0]) is written into
; lane 0 of %a0; the other lanes of %a0 are returned unchanged.
define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
; CHECK-FMA-WIN-NEXT:    vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <4 x float> %a0, i64 0
  %2 = extractelement <4 x float> %a1, i64 0
  %3 = extractelement <4 x float> %a2, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = insertelement <4 x float> %a0, float %4, i64 0
  ret <4 x float> %5
}

; Scalar f32 fused multiply-add with the first two operands swapped:
; fma(%a1[0], %a0[0], %a2[0]) is written into lane 0 of %a1, so the
; pass-through lanes come from %a1 rather than %a0.
define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca]
; CHECK-FMA-NEXT:    # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
; CHECK-AVX512VL-NEXT:    # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
; CHECK-FMA-WIN-NEXT:    vfmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x01]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <4 x float> %a1, i64 0
  %2 = extractelement <4 x float> %a0, i64 0
  %3 = extractelement <4 x float> %a2, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = insertelement <4 x float> %a1, float %4, i64 0
  ret <4 x float> %5
}

; Scalar f32 fused multiply-add whose result is written into lane 0 of the
; addend vector %a2: fma(%a0[0], %a1[0], %a2[0]) -> %a2[0]. The other lanes
; of %a2 are returned unchanged.
define <4 x float> @test_x86_fma_vfmadd_ss_231(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss_231:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 # encoding: [0xc4,0xe2,0x79,0xb9,0xd1]
; CHECK-FMA-NEXT:    # xmm2 = (xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss_231:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0xd1]
; CHECK-AVX512VL-NEXT:    # xmm2 = (xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss_231:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%r8), %xmm0 # encoding: [0xc4,0xc1,0x78,0x28,0x00]
; CHECK-FMA-WIN-NEXT:    vmovss (%rcx), %xmm1 # encoding: [0xc5,0xfa,0x10,0x09]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
; CHECK-FMA-WIN-NEXT:    vfmadd231ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xb9,0x02]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * mem) + xmm0
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <4 x float> %a0, i64 0
  %2 = extractelement <4 x float> %a1, i64 0
  %3 = extractelement <4 x float> %a2, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = insertelement <4 x float> %a2, float %4, i64 0
  ret <4 x float> %5
}

; Scalar f64 fused multiply-add: fma(%a0[0], %a1[0], %a2[0]) is written into
; lane 0 of %a0; the upper lane of %a0 is returned unchanged.
define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
; CHECK-FMA-WIN-NEXT:    vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <2 x double> %a0, i64 0
  %2 = extractelement <2 x double> %a1, i64 0
  %3 = extractelement <2 x double> %a2, i64 0
  %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
  %5 = insertelement <2 x double> %a0, double %4, i64 0
  ret <2 x double> %5
}

; Scalar f64 fused multiply-add with the first two operands swapped:
; fma(%a1[0], %a0[0], %a2[0]) is written into lane 0 of %a1, so the
; pass-through lane comes from %a1 rather than %a0.
define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
; CHECK-FMA-NEXT:    # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
; CHECK-AVX512VL-NEXT:    # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
; CHECK-FMA-WIN-NEXT:    vfmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x01]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <2 x double> %a1, i64 0
  %2 = extractelement <2 x double> %a0, i64 0
  %3 = extractelement <2 x double> %a2, i64 0
  %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
  %5 = insertelement <2 x double> %a1, double %4, i64 0
  ret <2 x double> %5
}

; Packed 128-bit f32 fused multiply-add: returns llvm.fma.v4f32(%a0, %a1, %a2).
define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
  ret <4 x float> %1
}

; Packed 128-bit f64 fused multiply-add: returns llvm.fma.v2f64(%a0, %a1, %a2).
define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
  ret <2 x double> %1
}

; Packed 256-bit f32 fused multiply-add: returns llvm.fma.v8f32(%a0, %a1, %a2).
define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) + mem
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
  ret <8 x float> %1
}

; Packed 256-bit f64 fused multiply-add: returns llvm.fma.v4f64(%a0, %a1, %a2).
define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) + mem
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
  ret <4 x double> %1
}

; VFMSUB
; Scalar f32 fused multiply-subtract: the addend %a2[0] is negated with
; `fsub -0.0, x` before the fma call, and fma(%a0[0], %a1[0], -%a2[0]) is
; written into lane 0 of %a0.
define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
; CHECK-FMA-WIN-NEXT:    vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <4 x float> %a0, i64 0
  %2 = extractelement <4 x float> %a1, i64 0
  %3 = extractelement <4 x float> %a2, i64 0
  %4 = fsub float -0.000000e+00, %3
  %5 = call float @llvm.fma.f32(float %1, float %2, float %4)
  %6 = insertelement <4 x float> %a0, float %5, i64 0
  ret <4 x float> %6
}

; Scalar f32 fused multiply-subtract with the first two operands swapped:
; fma(%a1[0], %a0[0], -%a2[0]) is written into lane 0 of %a1. The addend is
; negated with `fsub -0.0, x`.
define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca]
; CHECK-FMA-NEXT:    # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-FMA-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
; CHECK-AVX512VL-NEXT:    # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
; CHECK-FMA-WIN-NEXT:    vfmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x01]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <4 x float> %a1, i64 0
  %2 = extractelement <4 x float> %a0, i64 0
  %3 = extractelement <4 x float> %a2, i64 0
  %4 = fsub float -0.000000e+00, %3
  %5 = call float @llvm.fma.f32(float %1, float %2, float %4)
  %6 = insertelement <4 x float> %a1, float %5, i64 0
  ret <4 x float> %6
}

; Scalar f64 fused multiply-subtract: the addend %a2[0] is negated with
; `fsub -0.0, x` before the fma call, and fma(%a0[0], %a1[0], -%a2[0]) is
; written into lane 0 of %a0.
define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
; CHECK-FMA-WIN-NEXT:    vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <2 x double> %a0, i64 0
  %2 = extractelement <2 x double> %a1, i64 0
  %3 = extractelement <2 x double> %a2, i64 0
  %4 = fsub double -0.000000e+00, %3
  %5 = call double @llvm.fma.f64(double %1, double %2, double %4)
  %6 = insertelement <2 x double> %a0, double %5, i64 0
  ret <2 x double> %6
}

; Scalar f64 fused multiply-subtract with the first two operands swapped:
; fma(%a1[0], %a0[0], -%a2[0]) is written into lane 0 of %a1. The addend is
; negated with `fsub -0.0, x`.
define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca]
; CHECK-FMA-NEXT:    # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-FMA-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
; CHECK-AVX512VL-NEXT:    # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
; CHECK-FMA-WIN-NEXT:    vfmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x01]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <2 x double> %a1, i64 0
  %2 = extractelement <2 x double> %a0, i64 0
  %3 = extractelement <2 x double> %a2, i64 0
  %4 = fsub double -0.000000e+00, %3
  %5 = call double @llvm.fma.f64(double %1, double %2, double %4)
  %6 = insertelement <2 x double> %a1, double %5, i64 0
  ret <2 x double> %6
}

; Packed 128-bit f32 fused multiply-subtract: %a2 is negated lane-wise with
; `fsub -0.0, x`, then llvm.fma.v4f32(%a0, %a1, -%a2) is returned.
define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
  %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %1)
  ret <4 x float> %2
}

; Packed 128-bit f64 fused multiply-subtract: %a2 is negated lane-wise with
; `fsub -0.0, x`, then llvm.fma.v2f64(%a0, %a1, -%a2) is returned.
define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
  %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %1)
  ret <2 x double> %2
}

; Packed 256-bit f32 fused multiply-subtract: %a2 is negated lane-wise with
; `fsub -0.0, x`, then llvm.fma.v8f32(%a0, %a1, -%a2) is returned.
define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) - mem
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %1)
  ret <8 x float> %2
}

; Packed 256-bit f64 fused multiply-subtract: %a2 is negated lane-wise with
; `fsub -0.0, x`, then llvm.fma.v4f64(%a0, %a1, -%a2) is returned.
define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) - mem
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
  %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %1)
  ret <4 x double> %2
}

; VFNMADD
; Scalar f32 negated fused multiply-add: the second multiplicand %a1[0] is
; negated with `fsub -0.0, x`, and fma(%a0[0], -%a1[0], %a2[0]) is written
; into lane 0 of %a0.
define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
; CHECK-FMA-WIN-NEXT:    vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02]
; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <4 x float> %a0, i64 0
  %2 = extractelement <4 x float> %a1, i64 0
  %3 = extractelement <4 x float> %a2, i64 0
  %4 = fsub float -0.000000e+00, %2
  %5 = call float @llvm.fma.f32(float %1, float %4, float %3)
  %6 = insertelement <4 x float> %a0, float %5, i64 0
  ret <4 x float> %6
}

; Scalar f32 negated fused multiply-add with the first two operands swapped:
; %a0[0] is negated with `fsub -0.0, x`, and fma(%a1[0], -%a0[0], %a2[0]) is
; written into lane 0 of %a1.
define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca]
; CHECK-FMA-NEXT:    # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
; CHECK-AVX512VL-NEXT:    # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
; CHECK-FMA-WIN-NEXT:    vfnmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x01]
; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <4 x float> %a1, i64 0
  %2 = extractelement <4 x float> %a0, i64 0
  %3 = extractelement <4 x float> %a2, i64 0
  %4 = fsub float -0.000000e+00, %2
  %5 = call float @llvm.fma.f32(float %1, float %4, float %3)
  %6 = insertelement <4 x float> %a1, float %5, i64 0
  ret <4 x float> %6
}

; Scalar f64 negated fused multiply-add: the second multiplicand %a1[0] is
; negated with `fsub -0.0, x`, and fma(%a0[0], -%a1[0], %a2[0]) is written
; into lane 0 of %a0.
define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
; CHECK-FMA-WIN-NEXT:    vfnmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x02]
; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <2 x double> %a0, i64 0
  %2 = extractelement <2 x double> %a1, i64 0
  %3 = extractelement <2 x double> %a2, i64 0
  %4 = fsub double -0.000000e+00, %2
  %5 = call double @llvm.fma.f64(double %1, double %4, double %3)
  %6 = insertelement <2 x double> %a0, double %5, i64 0
  ret <2 x double> %6
}

; Scalar f64 negated fused multiply-add with the first two operands swapped:
; %a0[0] is negated with `fsub -0.0, x`, and fma(%a1[0], -%a0[0], %a2[0]) is
; written into lane 0 of %a1.
define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA:       # %bb.0:
; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca]
; CHECK-FMA-NEXT:    # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-AVX512VL:       # %bb.0:
; CHECK-AVX512VL-NEXT:    vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
; CHECK-AVX512VL-NEXT:    # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA-WIN:       # %bb.0:
; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
; CHECK-FMA-WIN-NEXT:    vfnmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x01]
; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  %1 = extractelement <2 x double> %a1, i64 0
  %2 = extractelement <2 x double> %a0, i64 0
  %3 = extractelement <2 x double> %a2, i64 0
  %4 = fsub double -0.000000e+00, %2
  %5 = call double @llvm.fma.f64(double %1, double %4, double %3)
  %6 = insertelement <2 x double> %a1, double %5, i64 0
  ret <2 x double> %6
}

; Packed <4 x float> negated multiply-add: fma(-%a0, %a1, %a2), with the
; negation spelled as an fsub from a -0.0 splat. Each run expects one
; vfnmadd213ps; the Windows run takes the addend directly from (%r8).
604define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
605; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
606; CHECK-FMA:       # %bb.0:
607; CHECK-FMA-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
608; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
609; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
610;
611; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps:
612; CHECK-AVX512VL:       # %bb.0:
613; CHECK-AVX512VL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
614; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
615; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
616;
617; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps:
618; CHECK-FMA-WIN:       # %bb.0:
619; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
620; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
621; CHECK-FMA-WIN-NEXT:    vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
622; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) + mem
623; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
624  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a0
625  %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %a1, <4 x float> %a2)
626  ret <4 x float> %2
627}
628
; Packed <2 x double> negated multiply-add: fma(-%a0, %a1, %a2), with the
; negation spelled as an fsub from a -0.0 splat. Each run expects one
; vfnmadd213pd; the Windows run takes the addend directly from (%r8).
629define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
630; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
631; CHECK-FMA:       # %bb.0:
632; CHECK-FMA-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
633; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
634; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
635;
636; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd:
637; CHECK-AVX512VL:       # %bb.0:
638; CHECK-AVX512VL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
639; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
640; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
641;
642; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd:
643; CHECK-FMA-WIN:       # %bb.0:
644; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
645; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
646; CHECK-FMA-WIN-NEXT:    vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
647; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) + mem
648; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
649  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a0
650  %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %a1, <2 x double> %a2)
651  ret <2 x double> %2
652}
653
; 256-bit <8 x float> negated multiply-add: fma(-%a0, %a1, %a2), with the
; negation spelled as an fsub from a -0.0 splat. Each run expects one ymm
; vfnmadd213ps; the Windows run takes the addend directly from (%r8).
654define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
655; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
656; CHECK-FMA:       # %bb.0:
657; CHECK-FMA-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
658; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
659; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
660;
661; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256:
662; CHECK-AVX512VL:       # %bb.0:
663; CHECK-AVX512VL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
664; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
665; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
666;
667; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256:
668; CHECK-FMA-WIN:       # %bb.0:
669; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
670; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
671; CHECK-FMA-WIN-NEXT:    vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
672; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) + mem
673; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
674  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a0
675  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %a1, <8 x float> %a2)
676  ret <8 x float> %2
677}
678
; 256-bit <4 x double> negated multiply-add: fma(-%a0, %a1, %a2), with the
; negation spelled as an fsub from a -0.0 splat. Each run expects one ymm
; vfnmadd213pd; the Windows run takes the addend directly from (%r8).
679define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
680; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
681; CHECK-FMA:       # %bb.0:
682; CHECK-FMA-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
683; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
684; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
685;
686; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256:
687; CHECK-AVX512VL:       # %bb.0:
688; CHECK-AVX512VL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
689; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
690; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
691;
692; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256:
693; CHECK-FMA-WIN:       # %bb.0:
694; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
695; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
696; CHECK-FMA-WIN-NEXT:    vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
697; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) + mem
698; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
699  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a0
700  %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %a1, <4 x double> %a2)
701  ret <4 x double> %2
702}
703
704; VFNMSUB
; Scalar f32 negated multiply-subtract: lane 0 is
; fma(%a0[0], -%a1[0], -%a2[0]), inserted back into %a0. Both negations are
; spelled as an fsub from -0.0. The non-Windows runs expect one vfnmsub213ss;
; the Windows run uses the 132 form with the multiplicand read from (%rdx).
705define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
706; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
707; CHECK-FMA:       # %bb.0:
708; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
709; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
710; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
711;
712; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss:
713; CHECK-AVX512VL:       # %bb.0:
714; CHECK-AVX512VL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
715; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
716; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
717;
718; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
719; CHECK-FMA-WIN:       # %bb.0:
720; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
721; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
722; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
723; CHECK-FMA-WIN-NEXT:    vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02]
724; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
725; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
726  %1 = extractelement <4 x float> %a0, i64 0
727  %2 = extractelement <4 x float> %a1, i64 0
728  %3 = extractelement <4 x float> %a2, i64 0
729  %4 = fsub float -0.000000e+00, %2
730  %5 = fsub float -0.000000e+00, %3
731  %6 = call float @llvm.fma.f32(float %1, float %4, float %5)
732  %7 = insertelement <4 x float> %a0, float %6, i64 0
733  ret <4 x float> %7
734}
735
; Scalar f32 negated multiply-subtract with the first two vector operands
; swapped: lane 0 is fma(%a1[0], -%a0[0], -%a2[0]), inserted into %a1.
; The non-Windows runs compute into xmm1 and copy to xmm0; the Windows run
; uses the 132 form with the multiplicand read from (%rcx).
736define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
737; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss:
738; CHECK-FMA:       # %bb.0:
739; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca]
740; CHECK-FMA-NEXT:    # xmm1 = -(xmm0 * xmm1) - xmm2
741; CHECK-FMA-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
742; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
743;
744; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss:
745; CHECK-AVX512VL:       # %bb.0:
746; CHECK-AVX512VL-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
747; CHECK-AVX512VL-NEXT:    # xmm1 = -(xmm0 * xmm1) - xmm2
748; CHECK-AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
749; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
750;
751; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss:
752; CHECK-FMA-WIN:       # %bb.0:
753; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
754; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
755; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
756; CHECK-FMA-WIN-NEXT:    vfnmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x01]
757; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
758; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
759  %1 = extractelement <4 x float> %a1, i64 0
760  %2 = extractelement <4 x float> %a0, i64 0
761  %3 = extractelement <4 x float> %a2, i64 0
762  %4 = fsub float -0.000000e+00, %2
763  %5 = fsub float -0.000000e+00, %3
764  %6 = call float @llvm.fma.f32(float %1, float %4, float %5)
765  %7 = insertelement <4 x float> %a1, float %6, i64 0
766  ret <4 x float> %7
767}
768
; Scalar f64 negated multiply-subtract: lane 0 is
; fma(%a0[0], -%a1[0], -%a2[0]), inserted back into %a0. Both negations are
; spelled as an fsub from -0.0. The non-Windows runs expect one vfnmsub213sd;
; the Windows run uses the 132 form with the multiplicand read from (%rdx).
769define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
770; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
771; CHECK-FMA:       # %bb.0:
772; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
773; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
774; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
775;
776; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd:
777; CHECK-AVX512VL:       # %bb.0:
778; CHECK-AVX512VL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
779; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
780; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
781;
782; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
783; CHECK-FMA-WIN:       # %bb.0:
784; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
785; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
786; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
787; CHECK-FMA-WIN-NEXT:    vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02]
788; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
789; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
790  %1 = extractelement <2 x double> %a0, i64 0
791  %2 = extractelement <2 x double> %a1, i64 0
792  %3 = extractelement <2 x double> %a2, i64 0
793  %4 = fsub double -0.000000e+00, %2
794  %5 = fsub double -0.000000e+00, %3
795  %6 = call double @llvm.fma.f64(double %1, double %4, double %5)
796  %7 = insertelement <2 x double> %a0, double %6, i64 0
797  ret <2 x double> %7
798}
799
; Scalar f64 negated multiply-subtract with the first two vector operands
; swapped: lane 0 is fma(%a1[0], -%a0[0], -%a2[0]), inserted into %a1.
; The non-Windows runs compute into xmm1 and copy to xmm0; the Windows run
; uses the 132 form with the multiplicand read from (%rcx).
800define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
801; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd:
802; CHECK-FMA:       # %bb.0:
803; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
804; CHECK-FMA-NEXT:    # xmm1 = -(xmm0 * xmm1) - xmm2
805; CHECK-FMA-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
806; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
807;
808; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd:
809; CHECK-AVX512VL:       # %bb.0:
810; CHECK-AVX512VL-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
811; CHECK-AVX512VL-NEXT:    # xmm1 = -(xmm0 * xmm1) - xmm2
812; CHECK-AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
813; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
814;
815; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd:
816; CHECK-FMA-WIN:       # %bb.0:
817; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
818; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
819; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
820; CHECK-FMA-WIN-NEXT:    vfnmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x01]
821; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
822; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
823  %1 = extractelement <2 x double> %a1, i64 0
824  %2 = extractelement <2 x double> %a0, i64 0
825  %3 = extractelement <2 x double> %a2, i64 0
826  %4 = fsub double -0.000000e+00, %2
827  %5 = fsub double -0.000000e+00, %3
828  %6 = call double @llvm.fma.f64(double %1, double %4, double %5)
829  %7 = insertelement <2 x double> %a1, double %6, i64 0
830  ret <2 x double> %7
831}
832
; Packed <4 x float> negated multiply-subtract: fma(-%a0, %a1, -%a2), with
; both negations spelled as an fsub from a -0.0 splat. Each run expects one
; vfnmsub213ps; the Windows run takes the third operand directly from (%r8).
833define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
834; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
835; CHECK-FMA:       # %bb.0:
836; CHECK-FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
837; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
838; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
839;
840; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps:
841; CHECK-AVX512VL:       # %bb.0:
842; CHECK-AVX512VL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
843; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
844; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
845;
846; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps:
847; CHECK-FMA-WIN:       # %bb.0:
848; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
849; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
850; CHECK-FMA-WIN-NEXT:    vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
851; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) - mem
852; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
853  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a0
854  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
855  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
856  ret <4 x float> %3
857}
858
; Packed <2 x double> negated multiply-subtract: fma(-%a0, %a1, -%a2), with
; both negations spelled as an fsub from a -0.0 splat. Each run expects one
; vfnmsub213pd; the Windows run takes the third operand directly from (%r8).
859define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
860; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
861; CHECK-FMA:       # %bb.0:
862; CHECK-FMA-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
863; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
864; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
865;
866; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd:
867; CHECK-AVX512VL:       # %bb.0:
868; CHECK-AVX512VL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
869; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
870; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
871;
872; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd:
873; CHECK-FMA-WIN:       # %bb.0:
874; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
875; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
876; CHECK-FMA-WIN-NEXT:    vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
877; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) - mem
878; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
879  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a0
880  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
881  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
882  ret <2 x double> %3
883}
884
; 256-bit <8 x float> negated multiply-subtract: fma(-%a0, %a1, -%a2), with
; both negations spelled as an fsub from a -0.0 splat. Each run expects one
; ymm vfnmsub213ps; the Windows run takes the third operand from (%r8).
885define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
886; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
887; CHECK-FMA:       # %bb.0:
888; CHECK-FMA-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
889; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
890; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
891;
892; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256:
893; CHECK-AVX512VL:       # %bb.0:
894; CHECK-AVX512VL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
895; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
896; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
897;
898; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256:
899; CHECK-FMA-WIN:       # %bb.0:
900; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
901; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
902; CHECK-FMA-WIN-NEXT:    vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
903; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) - mem
904; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
905  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a0
906  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
907  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
908  ret <8 x float> %3
909}
910
; 256-bit <4 x double> negated multiply-subtract: fma(-%a0, %a1, -%a2), with
; both negations spelled as an fsub from a -0.0 splat. Each run expects one
; ymm vfnmsub213pd; the Windows run takes the third operand from (%r8).
911define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
912; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
913; CHECK-FMA:       # %bb.0:
914; CHECK-FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
915; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
916; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
917;
918; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256:
919; CHECK-AVX512VL:       # %bb.0:
920; CHECK-AVX512VL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
921; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
922; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
923;
924; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256:
925; CHECK-FMA-WIN:       # %bb.0:
926; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
927; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
928; CHECK-FMA-WIN-NEXT:    vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
929; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) - mem
930; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
931  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a0
932  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
933  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
934  ret <4 x double> %3
935}
936
937; VFMADDSUB
; <4 x float> alternating multiply-add/subtract built from two fma calls:
; %1 adds %a2 and %3 adds its negation (an fsub from a -0.0 splat). The
; shufflevector takes even lanes from %3 (subtract) and odd lanes from %1
; (add). Each run expects the pair to fold into one vfmaddsub213ps.
938define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
939; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
940; CHECK-FMA:       # %bb.0:
941; CHECK-FMA-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
942; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
943; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
944;
945; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps:
946; CHECK-AVX512VL:       # %bb.0:
947; CHECK-AVX512VL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
948; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
949; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
950;
951; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps:
952; CHECK-FMA-WIN:       # %bb.0:
953; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
954; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
955; CHECK-FMA-WIN-NEXT:    vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
956; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) +/- mem
957; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
958  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
959  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
960  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %2)
961  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
962  ret <4 x float> %4
963}
964
; <2 x double> alternating multiply-add/subtract built from two fma calls:
; %1 adds %a2 and %3 adds its negation (an fsub from a -0.0 splat). The
; shufflevector takes lane 0 from %3 (subtract) and lane 1 from %1 (add).
; Each run expects the pair to fold into one vfmaddsub213pd.
965define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
966; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
967; CHECK-FMA:       # %bb.0:
968; CHECK-FMA-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
969; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
970; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
971;
972; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd:
973; CHECK-AVX512VL:       # %bb.0:
974; CHECK-AVX512VL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
975; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
976; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
977;
978; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd:
979; CHECK-FMA-WIN:       # %bb.0:
980; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
981; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
982; CHECK-FMA-WIN-NEXT:    vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
983; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) +/- mem
984; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
985  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
986  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
987  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
988  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
989  ret <2 x double> %4
990}
991
; 256-bit <8 x float> alternating multiply-add/subtract built from two fma
; calls: %1 adds %a2 and %3 adds its negation (an fsub from a -0.0 splat).
; The shufflevector takes even lanes from %3 (subtract) and odd lanes from
; %1 (add). Each run expects the pair to fold into one ymm vfmaddsub213ps.
992define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
993; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
994; CHECK-FMA:       # %bb.0:
995; CHECK-FMA-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
996; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
997; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
998;
999; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256:
1000; CHECK-AVX512VL:       # %bb.0:
1001; CHECK-AVX512VL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
1002; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
1003; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
1004;
1005; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256:
1006; CHECK-FMA-WIN:       # %bb.0:
1007; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
1008; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
1009; CHECK-FMA-WIN-NEXT:    vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
1010; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) +/- mem
1011; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
1012  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
1013  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
1014  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %2)
1015  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1016  ret <8 x float> %4
1017}
1018
; 256-bit <4 x double> alternating multiply-add/subtract built from two fma
; calls: %1 adds %a2 and %3 adds its negation (an fsub from a -0.0 splat).
; The shufflevector takes even lanes from %3 (subtract) and odd lanes from
; %1 (add). Each run expects the pair to fold into one ymm vfmaddsub213pd.
1019define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
1020; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
1021; CHECK-FMA:       # %bb.0:
1022; CHECK-FMA-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
1023; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
1024; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
1025;
1026; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256:
1027; CHECK-AVX512VL:       # %bb.0:
1028; CHECK-AVX512VL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
1029; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
1030; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
1031;
1032; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256:
1033; CHECK-FMA-WIN:       # %bb.0:
1034; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
1035; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
1036; CHECK-FMA-WIN-NEXT:    vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
1037; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) +/- mem
1038; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
1039  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
1040  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
1041  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
1042  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1043  ret <4 x double> %4
1044}
1045
1046; VFMSUBADD
; <4 x float> alternating multiply-subtract/add built from two fma calls:
; %1 adds %a2 and %3 adds its negation (an fsub from a -0.0 splat). The
; shufflevector takes even lanes from %1 (add) and odd lanes from %3
; (subtract). Each run expects the pair to fold into one vfmsubadd213ps.
1047define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
1048; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
1049; CHECK-FMA:       # %bb.0:
1050; CHECK-FMA-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
1051; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
1052; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
1053;
1054; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps:
1055; CHECK-AVX512VL:       # %bb.0:
1056; CHECK-AVX512VL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
1057; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
1058; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
1059;
1060; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps:
1061; CHECK-FMA-WIN:       # %bb.0:
1062; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
1063; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
1064; CHECK-FMA-WIN-NEXT:    vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
1065; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ mem
1066; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
1067  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
1068  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
1069  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %2)
1070  %4 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1071  ret <4 x float> %4
1072}
1073
; <2 x double> alternating multiply-subtract/add built from two fma calls:
; %1 adds %a2 and %3 adds its negation (an fsub from a -0.0 splat). The
; shufflevector takes lane 0 from %1 (add) and lane 1 from %3 (subtract).
; Each run expects the pair to fold into one vfmsubadd213pd.
1074define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
1075; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
1076; CHECK-FMA:       # %bb.0:
1077; CHECK-FMA-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
1078; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
1079; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
1080;
1081; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd:
1082; CHECK-AVX512VL:       # %bb.0:
1083; CHECK-AVX512VL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
1084; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
1085; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
1086;
1087; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd:
1088; CHECK-FMA-WIN:       # %bb.0:
1089; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
1090; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
1091; CHECK-FMA-WIN-NEXT:    vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
1092; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ mem
1093; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
1094  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
1095  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
1096  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
1097  %4 = shufflevector <2 x double> %1, <2 x double> %3, <2 x i32> <i32 0, i32 3>
1098  ret <2 x double> %4
1099}
1100
; 256-bit <8 x float> alternating multiply-subtract/add built from two fma
; calls: %1 adds %a2 and %3 adds its negation (an fsub from a -0.0 splat).
; The shufflevector takes even lanes from %1 (add) and odd lanes from %3
; (subtract). Each run expects the pair to fold into one ymm vfmsubadd213ps.
1101define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
1102; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
1103; CHECK-FMA:       # %bb.0:
1104; CHECK-FMA-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
1105; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
1106; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
1107;
1108; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256:
1109; CHECK-AVX512VL:       # %bb.0:
1110; CHECK-AVX512VL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
1111; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
1112; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
1113;
1114; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256:
1115; CHECK-FMA-WIN:       # %bb.0:
1116; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
1117; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
1118; CHECK-FMA-WIN-NEXT:    vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
1119; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ mem
1120; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
1121  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
1122  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
1123  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %2)
1124  %4 = shufflevector <8 x float> %1, <8 x float> %3, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1125  ret <8 x float> %4
1126}
1127
; 256-bit double-precision fmsubadd pattern: the result takes its even lanes
; from fma(a0, a1, a2) and its odd lanes from fma(a0, a1, -a2). The assertions
; below expect each RUN configuration to emit one vfmsubadd213pd for the whole
; sequence; on the windows triple the three operands are read through the
; pointers in rcx, rdx and r8 instead of arriving in ymm0-ymm2.
1128define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
1129; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
1130; CHECK-FMA:       # %bb.0:
1131; CHECK-FMA-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
1132; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
1133; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
1134;
1135; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256:
1136; CHECK-AVX512VL:       # %bb.0:
1137; CHECK-AVX512VL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
1138; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
1139; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
1140;
1141; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256:
1142; CHECK-FMA-WIN:       # %bb.0:
1143; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
1144; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
1145; CHECK-FMA-WIN-NEXT:    vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
1146; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ mem
1147; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
  ; %1 = a0 * a1 + a2 in every lane.
1148  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
  ; %2 = -0.0 - a2, i.e. a2 negated; %3 = a0 * a1 - a2 in every lane.
1149  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
1150  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
  ; Mask indices 0,2 pick the even lanes of %1; indices 5,7 pick lanes 1,3 of
  ; the second operand %3.
1151  %4 = shufflevector <4 x double> %1, <4 x double> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1152  ret <4 x double> %4
1153}
1154
; Declarations of the llvm.fma intrinsic overloads: scalar float/double, the
; 128-bit vectors (<4 x float>, <2 x double>) and the 256-bit vectors
; (<8 x float>, <4 x double>).
1155declare float @llvm.fma.f32(float, float, float)
1156declare double @llvm.fma.f64(double, double, double)
1157declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
1158declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
1159declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
1160declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
1161
; Attribute group #0, referenced by the test function definitions: nounwind.
1162attributes #0 = { nounwind }
1163