; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F

; 'signum' test cases (PR13248)

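; The pattern under test computes signum(x) as sitofp(x < 0) - sitofp(x > 0):
; sitofp of an i1 'true' yields -1.0, so the difference is -1.0 for negative
; inputs, +1.0 for positive inputs, and 0.0 otherwise. A minimal scalar sketch
; of the same idea (illustrative only, not one of the test cases below):
;
;   define float @signum_scalar(float %x) {
;     %lt = fcmp olt float %x, 0.0
;     %a  = sitofp i1 %lt to float
;     %gt = fcmp ogt float %x, 0.0
;     %b  = sitofp i1 %gt to float
;     %r  = fsub float %a, %b
;     ret float %r
;   }
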
;
; generic implementation for 128-bit vectors
;

define void @signum32a(<4 x float>*) {
; AVX-LABEL: signum32a:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
; AVX-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
entry:
  %1 = load <4 x float>, <4 x float>* %0
  %2 = fcmp olt <4 x float> %1, zeroinitializer
  %3 = sitofp <4 x i1> %2 to <4 x float>
  %4 = fcmp ogt <4 x float> %1, zeroinitializer
  %5 = sitofp <4 x i1> %4 to <4 x float>
  %6 = fsub <4 x float> %3, %5
  store <4 x float> %6, <4 x float>* %0
  ret void
}

define void @signum64a(<2 x double>*) {
; AVX-LABEL: signum64a:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpltpd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX-NEXT:    vcvtdq2pd %xmm2, %xmm2
; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    vsubpd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovapd %xmm0, (%rdi)
; AVX-NEXT:    retq
entry:
  %1 = load <2 x double>, <2 x double>* %0
  %2 = fcmp olt <2 x double> %1, zeroinitializer
  %3 = sitofp <2 x i1> %2 to <2 x double>
  %4 = fcmp ogt <2 x double> %1, zeroinitializer
  %5 = sitofp <2 x i1> %4 to <2 x double>
  %6 = fsub <2 x double> %3, %5
  store <2 x double> %6, <2 x double>* %0
  ret void
}

;
; generic implementation for 256-bit vectors
;

define void @signum32b(<8 x float>*) {
; AVX-LABEL: signum32b:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vcvtdq2ps %ymm2, %ymm2
; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vmovaps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
entry:
  %1 = load <8 x float>, <8 x float>* %0
  %2 = fcmp olt <8 x float> %1, zeroinitializer
  %3 = sitofp <8 x i1> %2 to <8 x float>
  %4 = fcmp ogt <8 x float> %1, zeroinitializer
  %5 = sitofp <8 x i1> %4 to <8 x float>
  %6 = fsub <8 x float> %3, %5
  store <8 x float> %6, <8 x float>* %0
  ret void
}

define void @signum64b(<4 x double>*) {
; AVX1-LABEL: signum64b:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovapd (%rdi), %ymm0
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vcvtdq2pd %xmm2, %ymm2
; AVX1-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovapd %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: signum64b:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovapd (%rdi), %ymm0
; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vcvtdq2pd %xmm2, %ymm2
; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovapd %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: signum64b:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vmovapd (%rdi), %ymm0
; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovqd %zmm2, %ymm2
; AVX512F-NEXT:    vcvtdq2pd %xmm2, %ymm2
; AVX512F-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vmovapd %ymm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
entry:
  %1 = load <4 x double>, <4 x double>* %0
  %2 = fcmp olt <4 x double> %1, zeroinitializer
  %3 = sitofp <4 x i1> %2 to <4 x double>
  %4 = fcmp ogt <4 x double> %1, zeroinitializer
  %5 = sitofp <4 x i1> %4 to <4 x double>
  %6 = fsub <4 x double> %3, %5
  store <4 x double> %6, <4 x double>* %0
  ret void
}

;
; implementation using AVX intrinsics for 256-bit vectors
;

define void @signum32c(<8 x float>*) {
; AVX-LABEL: signum32c:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vcvtdq2ps %ymm2, %ymm2
; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vmovaps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
entry:
  %1 = load <8 x float>, <8 x float>* %0
  %2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1)
  %3 = bitcast <8 x float> %2 to <8 x i32>
  %4 = sitofp <8 x i32> %3 to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %1, i8 1)
  %6 = bitcast <8 x float> %5 to <8 x i32>
  %7 = sitofp <8 x i32> %6 to <8 x float>
  %8 = fsub <8 x float> %4, %7
  store <8 x float> %8, <8 x float>* %0
  ret void
}

define void @signum64c(<4 x double>*) {
; AVX1-LABEL: signum64c:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovapd (%rdi), %ymm0
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX1-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: signum64c:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovapd (%rdi), %ymm0
; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: signum64c:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vmovapd (%rdi), %ymm0
; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    vmovaps %ymm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
entry:
  %x = load <4 x double>, <4 x double>* %0
  %xgt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %x, <4 x double> zeroinitializer, i8 1)
  %igt = bitcast <4 x double> %xgt to <8 x i32>
  %xlt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %x, i8 1)
  %ilt = bitcast <4 x double> %xlt to <8 x i32>
  ; it is important to use %isign twice as the shuffle source in order to make LLVM use a shuffle operation
  %isign = sub <8 x i32> %igt, %ilt
  %ssign = shufflevector <8 x i32> %isign, <8 x i32> %isign, <4 x i32> <i32 0, i32 2, i32 12, i32 14>
  %sign = tail call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %ssign)
  store <4 x double> %sign, <4 x double>* %0
  ret void
}
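
; Note on the shuffle in signum64c: shufflevector indices 8..15 address the
; second operand, so with %isign passed in both positions the mask
; <0, 2, 12, 14> selects elements 0, 2, 4, 6 of %isign, i.e. the low 32-bit
; word of each 64-bit comparison lane. A single-source form of the same
; selection (illustrative sketch only, deliberately not what the test uses,
; since the two-operand form is what forces the shuffle) would be:
;
;   %ssign = shufflevector <8 x i32> %isign, <8 x i32> undef,
;                          <4 x i32> <i32 0, i32 2, i32 4, i32 6>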

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone