; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
; REQUIRES: asserts

; Tests whether the loop vectorizer correctly decides when SIMD is safe with
; respect to the IEEE 754 standard.
; On Linux, we only want the vectorizer to work when the -ffast-math flag is
; set, because NEON is not IEEE compliant.
; Darwin, on the other hand, doesn't support subnormals, and all optimizations
; are allowed, even without -ffast-math.
;
; The CHECK prefix is shared by every RUN line; LINUX and DARWIN directives
; apply only to the RUN lines that enable the matching prefix.

; Integer loops are always vectorizable.
; Loop body: C[i] = B[i] * A[i] on i32 elements.
; CHECK: Checking a loop in "sumi"
; CHECK: We can vectorize this loop!
define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  ; Skip the loop entirely when the trip count is zero.
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Floating-point loops need fast-math to be vectorizable.
; Loop body: C[i] = A[i] * B[i] on float elements, with no fast-math flags on
; the fmul.
; LINUX: Checking a loop in "sumf"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "sumf"
; DARWIN: We can vectorize this loop!
define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; Skip the loop entirely when the trip count is zero.
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul float %0, %1
  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Integer loops are always vectorizable.
; CHECK: Checking a loop in "redi"
; CHECK: We can vectorize this loop!
; Integer reduction: accumulates b[i] * a[i] over i in [0, N) into an i32.
; The accumulator's initial value is undef, and undef is also returned when
; N == 0.
define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %acc = phi i32 [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds i32, i32* %a, i32 %iv
  %a.val = load i32, i32* %a.addr, align 4
  %b.addr = getelementptr inbounds i32, i32* %b, i32 %iv
  %b.val = load i32, i32* %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %acc.next = add nsw i32 %prod, %acc
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  ; LCSSA phi carrying the final accumulator value out of the loop.
  %acc.lcssa = phi i32 [ %acc.next, %loop ]
  br label %exit

exit:
  %result = phi i32 [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret i32 %result
}

; Floating-point reductions without fast-math flags: expected to be rejected
; on the Linux triples and accepted on Darwin.
; LINUX: Checking a loop in "redf"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "redf"
; DARWIN: We can vectorize this loop!
; Float reduction: accumulates a[i] * b[i] over i in [0, N) into a float.
; Neither the fmul nor the fadd carries fast-math flags. The accumulator's
; initial value is undef, and undef is also returned when N == 0.
define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %acc = phi float [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds float, float* %a, i32 %iv
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %b, i32 %iv
  %b.val = load float, float* %b.addr, align 4
  %prod = fmul float %a.val, %b.val
  %acc.next = fadd float %acc, %prod
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  ; LCSSA phi carrying the final accumulator value out of the loop.
  %acc.lcssa = phi float [ %acc.next, %loop ]
  br label %exit

exit:
  %result = phi float [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret float %result
}

; Calls that turn into builtins must be covered as well: expected to be
; rejected on the Linux triples and accepted on Darwin.
; LINUX: Checking a loop in "fabs"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "fabs"
; DARWIN: We can vectorize this loop!
; Loop body: C[i] = A[i] * fabsf(B[i]) on float elements, with no fast-math
; flags on the call or on the fmul. There is no separate preheader block: the
; loop is entered straight from %entry.
define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop

loop:
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
  %a.addr = getelementptr inbounds float, float* %A, i32 %iv
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %B, i32 %iv
  %b.val = load float, float* %b.addr, align 4
  ; Call-site attributes come from attribute group #1.
  %b.abs = tail call float @fabsf(float %b.val) #1
  %prod = fmul float %a.val, %b.abs
  %c.addr = getelementptr inbounds float, float* %C, i32 %iv
  store float %prod, float* %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

; Integer loops are expected to vectorize under every RUN configuration.
; CHECK: Checking a loop in "sumi_fast"
; CHECK: We can vectorize this loop!
; Loop body: C[i] = B[i] * A[i] on i32 elements.
define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds i32, i32* %A, i32 %iv
  %a.val = load i32, i32* %a.addr, align 4
  %b.addr = getelementptr inbounds i32, i32* %B, i32 %iv
  %b.val = load i32, i32* %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %c.addr = getelementptr inbounds i32, i32* %C, i32 %iv
  store i32 %prod, i32* %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  br label %exit

exit:
  ret void
}

; Floating-point loops whose operations carry the 'fast' flag are expected to
; vectorize under every RUN configuration.
; CHECK: Checking a loop in "sumf_fast"
; CHECK: We can vectorize this loop!
; Loop body: C[i] = B[i] * A[i] on float elements; the fmul carries the 'fast'
; flag.
define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds float, float* %A, i32 %iv
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %B, i32 %iv
  %b.val = load float, float* %b.addr, align 4
  %prod = fmul fast float %b.val, %a.val
  %c.addr = getelementptr inbounds float, float* %C, i32 %iv
  store float %prod, float* %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  br label %exit

exit:
  ret void
}

; Integer reductions are expected to vectorize under every RUN configuration.
; CHECK: Checking a loop in "redi_fast"
; CHECK: We can vectorize this loop!
; Integer reduction: accumulates b[i] * a[i] over i in [0, N) into an i32.
; The accumulator's initial value is undef, and undef is also returned when
; N == 0.
define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %acc = phi i32 [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds i32, i32* %a, i32 %iv
  %a.val = load i32, i32* %a.addr, align 4
  %b.addr = getelementptr inbounds i32, i32* %b, i32 %iv
  %b.val = load i32, i32* %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %acc.next = add nsw i32 %prod, %acc
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  ; LCSSA phi carrying the final accumulator value out of the loop.
  %acc.lcssa = phi i32 [ %acc.next, %loop ]
  br label %exit

exit:
  %result = phi i32 [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret i32 %result
}

; Floating-point reductions whose operations carry the 'fast' flag are
; expected to vectorize under every RUN configuration.
; CHECK: Checking a loop in "redf_fast"
; CHECK: We can vectorize this loop!
; Float reduction: accumulates b[i] * a[i] over i in [0, N) into a float.
; Both the fmul and the fadd carry the 'fast' flag. The accumulator's initial
; value is undef, and undef is also returned when N == 0.
define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %acc = phi float [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds float, float* %a, i32 %iv
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %b, i32 %iv
  %b.val = load float, float* %b.addr, align 4
  %prod = fmul fast float %b.val, %a.val
  %acc.next = fadd fast float %prod, %acc
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  ; LCSSA phi carrying the final accumulator value out of the loop.
  %acc.lcssa = phi float [ %acc.next, %loop ]
  br label %exit

exit:
  %result = phi float [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret float %result
}

; Calls that turn into builtins must be covered as well; with 'fast' on the
; call and the fmul, the loop is expected to vectorize under every RUN
; configuration.
; CHECK: Checking a loop in "fabs_fast"
; CHECK: We can vectorize this loop!
; Loop body: C[i] = fabsf(B[i]) * A[i] on float elements; both the call and
; the fmul carry the 'fast' flag. There is no separate preheader block: the
; loop is entered straight from %entry.
define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop

loop:
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
  %a.addr = getelementptr inbounds float, float* %A, i32 %iv
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %B, i32 %iv
  %b.val = load float, float* %b.addr, align 4
  ; Call-site attributes come from attribute group #2.
  %b.abs = tail call fast float @fabsf(float %b.val) #2
  %prod = fmul fast float %b.abs, %a.val
  %c.addr = getelementptr inbounds float, float* %C, i32 %iv
  store float %prod, float* %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

; External declaration of the callee used by the fabs tests.
declare float @fabsf(float)

; Attribute group #1: used at a call site in this file; its FP-math string
; attributes ("no-infs-fp-math", "no-nans-fp-math", "unsafe-fp-math") are all
; "false".
attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
; Attribute group #2: same as #1 except that the three FP-math string
; attributes are "true".
attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }