; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
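;
; Tests 'fast' llvm.experimental.vector.reduce.fmul lowering for v2..v16 x f32/f64
; with a scalar accumulator argument, a constant 1.0 start value, and an undef
; start value, across the SSE2/SSE4.1/AVX/AVX2/AVX512 configurations above.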

;
; vXf32 (accum)
;
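; Reductions seeded with the scalar accumulator argument %a0.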

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm4, %xmm2
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm4, %xmm2
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (one)
;
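; Reductions seeded with a constant 1.0 start value (the functions are named *_zero but pass 1.0).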

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;
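; Reductions seeded with an undef start value.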

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;
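; f64 reductions seeded with the scalar accumulator argument %a0.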

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT:    vmulpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT:    vmulpd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm4, %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm2, %ymm1, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm2, %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm4, %ymm2, %ymm0
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (one)
;
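; f64 reductions seeded with a constant 1.0 start value (functions named *_zero but pass 1.0).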

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;
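; f64 reductions seeded with an undef start value.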

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)