; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL

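; These intrinsics perform an ordered reduction when no fast-math flags are
; present: for <4 x float> the result is (((%a0 + %v0) + %v1) + %v2) + %v3, so
; llc lowers each step to a scalar addss/addsd plus a shuffle to extract the
; next lane instead of a vector tree reduction.
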
;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[2,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm3
; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[2,3]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm5
; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm4, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm4, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm5
; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm4, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
;

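; A 0.0 accumulator cannot simply be folded away without fast-math (0.0 + -0.0
; yields +0.0, not -0.0), so the zero is still materialized with xorps and
; added as the first step of the chain.
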
define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addss %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm2
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    addss %xmm2, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    addss %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
; SSE2-NEXT:    addss %xmm2, %xmm3
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE2-NEXT:    addss %xmm3, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    addss %xmm0, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm3
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE41-NEXT:    addss %xmm3, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm4, %xmm4
; SSE2-NEXT:    addss %xmm0, %xmm4
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[2,3]
; SSE2-NEXT:    addss %xmm4, %xmm5
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
; SSE2-NEXT:    addss %xmm5, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm4, %xmm4
; SSE41-NEXT:    addss %xmm0, %xmm4
; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm5
; SSE41-NEXT:    movaps %xmm0, %xmm4
; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
; SSE41-NEXT:    addss %xmm5, %xmm4
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm4
; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    addss {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    addss {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
; SSE2-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
; SSE2-NEXT:    addss %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
; SSE41-NEXT:    addss %xmm2, %xmm3
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
; SSE2-NEXT:    addss {{.*}}(%rip), %xmm4
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
; SSE2-NEXT:    addss %xmm4, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{.*}}(%rip), %xmm4
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
; SSE41-NEXT:    addss %xmm4, %xmm5
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm4
; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    addsd %xmm8, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm8 = xmm8[1,1]
; SSE-NEXT:    addsd %xmm8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm4[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (zero)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorpd %xmm1, %xmm1
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorpd %xmm2, %xmm2
; SSE-NEXT:    addsd %xmm0, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorpd %xmm4, %xmm4
; SSE-NEXT:    addsd %xmm0, %xmm4
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorpd %xmm8, %xmm8
; SSE-NEXT:    addsd %xmm0, %xmm8
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd %xmm8, %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
; AVX-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm5, %xmm4, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;
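; With an undef accumulator the first scalar add constant-folds, so the checks
; below expect the reduction chain to start from an add against a constant-pool
; value (the {{.*}}(%rip) memory operand) rather than from an initial register
; add.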

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

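; Declarations of the experimental fadd reduction intrinsics exercised above:
; the first operand is the scalar start value (accumulator) and the second is
; the vector to reduce. Without fast-math flags the reduction is sequential,
; which is why the lowerings above are chains of scalar addss/addsd.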
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)