; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
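; Each RUN line compiles this file with llc for one x86 feature level and
; verifies the emitted assembly with FileCheck; the shared ALL/SSE/AVX/AVX512
; prefixes let configurations with identical codegen reuse the same check lines.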

;
; vXf32
;

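; All of these reductions lower to a shuffle/extract tree: the upper half of
; the vector is moved down (shufps, movshdup, movhlps, or a 128/256-bit lane
; extract) and combined with a packed max, halving the width each step until
; the scalar result sits in the low element of xmm0.
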
define float @test_v2f32(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fmax.f32.v2f32(<2 x float> %a0)
  ret float %1
}
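; Only the high 32-bit lane needs folding in for v2f32, so a single shuffle
; plus maxps suffices. With bare SSE2 the odd element is copied down with
; movaps+shufps; SSE4.1 targets use movshdup (an SSE3 instruction) instead,
; saving the extra register copy.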

define float @test_v4f32(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a0)
  ret float %1
}

define float @test_v8f32(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fmax.f32.v8f32(<8 x float> %a0)
  ret float %1
}
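; For 256-bit inputs the two 128-bit halves arrive in xmm0/xmm1 on SSE targets,
; so the reduction starts with a plain maxps; AVX instead folds the high lane
; in with vextractf128. Note the AVX max ops stay at the full ymm width even
; once only the low xmm lane is meaningful, and vzeroupper is emitted before
; returning to avoid AVX-to-SSE transition penalties.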

define float @test_v16f32(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm3, %xmm1
; SSE2-NEXT:    maxps %xmm2, %xmm0
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    maxps %xmm3, %xmm1
; SSE41-NEXT:    maxps %xmm2, %xmm0
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.experimental.vector.reduce.fmax.f32.v16f32(<16 x float> %a0)
  ret float %1
}
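; At 512 bits AVX512 can keep the whole input in zmm0 and halve it in place:
; vextractf64x4 pulls down the high 256 bits, vextractf128 the high 128, and
; the remaining steps are the usual in-lane shuffles.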

;
; vXf64
;

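; The f64 reductions follow the same tree, but with half as many lanes per
; width each case needs one fewer shuffle step, and SSE2 and SSE4.1 produce
; identical code (hence the shared SSE check prefix).
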
define double @test_v2f64(<2 x double> %a0) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a0)
  ret double %1
}

define double @test_v4f64(<4 x double> %a0) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fmax.f64.v4f64(<4 x double> %a0)
  ret double %1
}

define double @test_v8f64(<8 x double> %a0) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm3, %xmm1
; SSE-NEXT:    maxpd %xmm2, %xmm0
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fmax.f64.v8f64(<8 x double> %a0)
  ret double %1
}

define double @test_v16f64(<16 x double> %a0) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm6, %xmm2
; SSE-NEXT:    maxpd %xmm4, %xmm0
; SSE-NEXT:    maxpd %xmm2, %xmm0
; SSE-NEXT:    maxpd %xmm7, %xmm3
; SSE-NEXT:    maxpd %xmm5, %xmm1
; SSE-NEXT:    maxpd %xmm3, %xmm1
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmaxpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.experimental.vector.reduce.fmax.f64.v16f64(<16 x double> %a0)
  ret double %1
}
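; In the SSE case above the <16 x double> argument arrives split across
; xmm0-xmm7, so the lowering first combines the eight registers pairwise
; before the final movhlps/maxpd step.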

declare float @llvm.experimental.vector.reduce.fmax.f32.v2f32(<2 x float>)
declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float>)
declare float @llvm.experimental.vector.reduce.fmax.f32.v8f32(<8 x float>)
declare float @llvm.experimental.vector.reduce.fmax.f32.v16f32(<16 x float>)

declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double>)
declare double @llvm.experimental.vector.reduce.fmax.f64.v4f64(<4 x double>)
declare double @llvm.experimental.vector.reduce.fmax.f64.v8f64(<8 x double>)
declare double @llvm.experimental.vector.reduce.fmax.f64.v16f64(<16 x double>)
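
; For reference, a hand-written scalar equivalent of the v4f32 case: a sketch,
; not part of the original autogenerated test, and not what the backend emits.
; It assumes the reduction's lane-max behavior matches llvm.maxnum; the
; function name below is made up for illustration.
define float @test_v4f32_scalarized(<4 x float> %a0) {
  %e0 = extractelement <4 x float> %a0, i32 0
  %e1 = extractelement <4 x float> %a0, i32 1
  %e2 = extractelement <4 x float> %a0, i32 2
  %e3 = extractelement <4 x float> %a0, i32 3
  %m01 = call float @llvm.maxnum.f32(float %e0, float %e1)   ; max of lanes 0 and 1
  %m23 = call float @llvm.maxnum.f32(float %e2, float %e3)   ; max of lanes 2 and 3
  %max = call float @llvm.maxnum.f32(float %m01, float %m23) ; overall maximum
  ret float %max
}

declare float @llvm.maxnum.f32(float, float)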