; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    haddps %xmm1, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    haddps %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    haddps %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm4, %xmm2
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm4, %xmm2
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    haddps %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    haddps %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm1, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm1, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm0
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm1, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    haddps %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm1, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm1, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm0
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm1, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE2-LABEL: test_v2f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    haddpd %xmm1, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddpd %xmm1, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE2-LABEL: test_v4f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm2, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm2, %xmm1
; SSE41-NEXT:    haddpd %xmm1, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE2-LABEL: test_v8f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm4, %xmm2
; SSE2-NEXT:    addpd %xmm3, %xmm1
; SSE2-NEXT:    addpd %xmm2, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm4, %xmm2
; SSE41-NEXT:    addpd %xmm3, %xmm1
; SSE41-NEXT:    addpd %xmm2, %xmm1
; SSE41-NEXT:    haddpd %xmm1, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE2-LABEL: test_v16f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm6, %xmm2
; SSE2-NEXT:    addpd %xmm7, %xmm3
; SSE2-NEXT:    addpd %xmm5, %xmm1
; SSE2-NEXT:    addpd %xmm3, %xmm1
; SSE2-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT:    addpd %xmm2, %xmm4
; SSE2-NEXT:    addpd %xmm1, %xmm4
; SSE2-NEXT:    movapd %xmm4, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; SSE2-NEXT:    addpd %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm6, %xmm2
; SSE41-NEXT:    addpd %xmm7, %xmm3
; SSE41-NEXT:    addpd %xmm5, %xmm1
; SSE41-NEXT:    addpd %xmm3, %xmm1
; SSE41-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
; SSE41-NEXT:    addpd %xmm2, %xmm4
; SSE41-NEXT:    addpd %xmm1, %xmm4
; SSE41-NEXT:    haddpd %xmm4, %xmm4
; SSE41-NEXT:    movapd %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %ymm4, %ymm2, %ymm0
; AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (zero)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE2-LABEL: test_v2f64_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f64_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    haddpd %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE2-LABEL: test_v4f64_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f64_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm1, %xmm0
; SSE41-NEXT:    haddpd %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE2-LABEL: test_v8f64_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm3, %xmm1
; SSE2-NEXT:    addpd %xmm2, %xmm0
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm3, %xmm1
; SSE41-NEXT:    addpd %xmm2, %xmm0
; SSE41-NEXT:    addpd %xmm1, %xmm0
; SSE41-NEXT:    haddpd %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE2-LABEL: test_v16f64_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm6, %xmm2
; SSE2-NEXT:    addpd %xmm4, %xmm0
; SSE2-NEXT:    addpd %xmm2, %xmm0
; SSE2-NEXT:    addpd %xmm7, %xmm3
; SSE2-NEXT:    addpd %xmm5, %xmm1
; SSE2-NEXT:    addpd %xmm3, %xmm1
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f64_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm6, %xmm2
; SSE41-NEXT:    addpd %xmm4, %xmm0
; SSE41-NEXT:    addpd %xmm2, %xmm0
; SSE41-NEXT:    addpd %xmm7, %xmm3
; SSE41-NEXT:    addpd %xmm5, %xmm1
; SSE41-NEXT:    addpd %xmm3, %xmm1
; SSE41-NEXT:    addpd %xmm0, %xmm1
; SSE41-NEXT:    haddpd %xmm1, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE2-LABEL: test_v2f64_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f64_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    haddpd %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE2-LABEL: test_v4f64_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f64_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm1, %xmm0
; SSE41-NEXT:    haddpd %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE2-LABEL: test_v8f64_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm3, %xmm1
; SSE2-NEXT:    addpd %xmm2, %xmm0
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm3, %xmm1
; SSE41-NEXT:    addpd %xmm2, %xmm0
; SSE41-NEXT:    addpd %xmm1, %xmm0
; SSE41-NEXT:    haddpd %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE2-LABEL: test_v16f64_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm6, %xmm2
; SSE2-NEXT:    addpd %xmm4, %xmm0
; SSE2-NEXT:    addpd %xmm2, %xmm0
; SSE2-NEXT:    addpd %xmm7, %xmm3
; SSE2-NEXT:    addpd %xmm5, %xmm1
; SSE2-NEXT:    addpd %xmm3, %xmm1
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f64_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm6, %xmm2
; SSE41-NEXT:    addpd %xmm4, %xmm0
; SSE41-NEXT:    addpd %xmm2, %xmm0
; SSE41-NEXT:    addpd %xmm7, %xmm3
; SSE41-NEXT:    addpd %xmm5, %xmm1
; SSE41-NEXT:    addpd %xmm3, %xmm1
; SSE41-NEXT:    addpd %xmm0, %xmm1
; SSE41-NEXT:    haddpd %xmm1, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)