; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
; REQUIRES: asserts

; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
; with regard to the IEEE 754 standard.
; On Linux, we only want the vectorizer to work when the -ffast-math flag is
; set, because NEON is not IEEE compliant.
; Darwin, on the other hand, doesn't support subnormals, so all optimizations
; are allowed, even without -ffast-math.

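; Background: ARMv7 NEON flushes single-precision subnormals to zero, so
; vectorizing strict FP arithmetic through it can change results, which is why
; it is not IEEE compliant. The *_fast functions below are the same loops with
; the 'fast' flag on their FP operations (roughly what -ffast-math produces),
; and that is enough to unblock vectorization on Linux as well.
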
; Integer loops are always vectorizable
; CHECK: Checking a loop in "sumi"
; CHECK: We can vectorize this loop!
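; For illustration only, @sumi roughly corresponds to C source like this
; (parameter names and qualifiers are an assumption, not part of the test):
;
;   void sumi(const int *A, const int *B, int *C, int N) {
;     for (int i = 0; i < N; i++)
;       C[i] = A[i] * B[i];   /* integer multiply: no FP semantics involved */
;   }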
define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Floating-point loops need fast-math to be vectorizable
; LINUX: Checking a loop in "sumf"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "sumf"
; DARWIN: We can vectorize this loop!
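; Illustrative sketch of the C source behind @sumf (assumed; same loop as
; @sumi but in single precision):
;
;   void sumf(const float *A, const float *B, float *C, int N) {
;     for (int i = 0; i < N; i++)
;       C[i] = A[i] * B[i];   /* strict fmul: unsafe to vectorize on NEON */
;   }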
define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul float %0, %1
  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "redi"
; CHECK: We can vectorize this loop!
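; Illustrative C equivalent of @redi (assumed; the reduction variable is left
; uninitialized, matching the 'undef' incoming value in the phi below):
;
;   int redi(const int *a, const int *b, int N) {
;     int red;
;     for (int i = 0; i < N; i++)
;       red += a[i] * b[i];
;     return red;
;   }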
define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret i32 %Red.0.lcssa
}

; Floating-point loops need fast-math to be vectorizable
; LINUX: Checking a loop in "redf"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "redf"
; DARWIN: We can vectorize this loop!
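; Illustrative C equivalent of @redf (assumed). Vectorizing this reduction
; reassociates the fadd chain, which can change the rounded result:
;
;   float redf(const float *a, const float *b, int N) {
;     float red;
;     for (int i = 0; i < N; i++)
;       red += a[i] * b[i];
;     return red;
;   }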
define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul float %0, %1
  %add = fadd float %Red.06, %mul
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  %add.lcssa = phi float [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret float %Red.0.lcssa
}

; Make sure calls that turn into builtins are also covered
; LINUX: Checking a loop in "fabs"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "fabs"
; DARWIN: We can vectorize this loop!
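; Illustrative C equivalent of @fabs (assumed; the loop calls libm's fabsf,
; which the vectorizer recognizes as a builtin; the C function is renamed here
; only to avoid clashing with libm's fabs):
;
;   #include <math.h>
;   void fabs_loop(const float *A, const float *B, float *C, int N) {
;     for (int i = 0; i < N; i++)
;       C[i] = A[i] * fabsf(B[i]);
;   }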
define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
  %1 = load float, float* %arrayidx1, align 4
  %fabsf = tail call float @fabsf(float %1) #1
  %conv3 = fmul float %0, %fabsf
  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
  store float %conv3, float* %arrayidx4, align 4
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "sumi_fast"
; CHECK: We can vectorize this loop!
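; The remaining *_fast functions repeat the loops above; the only difference is
; the 'fast' flag on their floating-point operations (and, for @fabs_fast,
; callsite attribute set #2 with "unsafe-fp-math"="true"), i.e. roughly what
; compiling with -ffast-math would produce.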
define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Floating-point loops can be vectorized with fast-math
; CHECK: Checking a loop in "sumf_fast"
; CHECK: We can vectorize this loop!
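; Same source as @sumf; the only IR difference is 'fmul fast'. For reference,
; IR of this shape could be produced by something like (assumed, not part of
; this test): clang -O2 -ffast-math -S -emit-llvm sumf.c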
define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "redi_fast"
; CHECK: We can vectorize this loop!
define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret i32 %Red.0.lcssa
}

; Floating-point loops can be vectorized with fast-math
; CHECK: Checking a loop in "redf_fast"
; CHECK: We can vectorize this loop!
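; Same source as @redf; 'fmul fast' and 'fadd fast' permit the reassociation
; needed to vectorize the reduction, so the CHECK prefix applies to both the
; Linux and Darwin runs.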
define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul fast float %1, %0
  %add = fadd fast float %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  %add.lcssa = phi float [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret float %Red.0.lcssa
}

; Make sure calls that turn into builtins are also covered
; CHECK: Checking a loop in "fabs_fast"
; CHECK: We can vectorize this loop!
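; Same source as @fabs; the call carries the 'fast' flag and attribute set #2
; ("unsafe-fp-math"="true"), so the fabsf call no longer blocks vectorization
; on Linux.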
define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
  %1 = load float, float* %arrayidx1, align 4
  %fabsf = tail call fast float @fabsf(float %1) #2
  %conv3 = fmul fast float %fabsf, %0
  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
  store float %conv3, float* %arrayidx4, align 4
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

declare float @fabsf(float)

attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }