; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=COMMON,DEFAULT
; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
; RUN: opt < %s -loop-vectorize -tail-predication=enabled -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-ENABLE-TP

; Counting-down loops on an MVE target: each function below decrements its
; loop counter towards an sgt/eq exit compare. The checks record, per
; configuration, whether the loop vectorizer emits a vector.body and whether
; the memory operations in it are masked (i.e. the tail was folded).

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-unknown-eabihf"

; This IR corresponds to this type of C-code:
;
;  void f(char *a, char *b, char *c, int N) {
;    while (N-- > 0)
;      *c++ = *a++ + *b++;
;  }
;
define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_loopguard(
; COMMON:       vector.body:

; CHECK-TF:     %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
; CHECK-TF:     %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %N)
; CHECK-TF:     llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
; CHECK-TF:     llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
; CHECK-TF:     llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
  %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
  %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1
  %0 = load i8, i8* %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %b.addr.07, i32 1
  %1 = load i8, i8* %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, i8* %c.addr.08, i32 1
  store i8 %add, i8* %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; No loop-guard: we need one for this to be valid.
;
define dso_local void @sgt_no_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_no_loopguard(
; COMMON:       vector.body:
; CHECK-TF:     masked.load
; CHECK-TF:     masked.load
; CHECK-TF:     masked.store
entry:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %entry ]
  %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %entry ]
  %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %entry ]
  %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %entry ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1
  %0 = load i8, i8* %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %b.addr.07, i32 1
  %1 = load i8, i8* %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, i8* %c.addr.08, i32 1
  store i8 %add, i8* %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; Same loop shape as @sgt_no_loopguard, except that the exit compare %cmp has
; an extra use besides the branch (the otherwise unused %select).
;
define dso_local void @sgt_extra_use_cmp(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_extra_use_cmp(
; COMMON:       vector.body:
; CHECK-TF:     masked.load
; CHECK-TF:     masked.load
; CHECK-TF:     masked.store
entry:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %entry ]
  %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %entry ]
  %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %entry ]
  %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %entry ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1
  %0 = load i8, i8* %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %b.addr.07, i32 1
  %1 = load i8, i8* %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, i8* %c.addr.08, i32 1
  store i8 %add, i8* %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  %select = select i1 %cmp, i8 %0, i8 %1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; The down-counter starts at the constant 2049 instead of %N; the loop guard
; still tests %N.
;
define dso_local void @sgt_const_tripcount(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_const_tripcount(
; COMMON:       vector.body:
; CHECK-TF:     masked.load
; CHECK-TF:     masked.load
; CHECK-TF:     masked.store
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ 2049, %while.body.preheader ]
  %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
  %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
  %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1
  %0 = load i8, i8* %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %b.addr.07, i32 1
  %1 = load i8, i8* %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, i8* %c.addr.08, i32 1
  store i8 %add, i8* %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; No loop guard and the down-counter starts at the constant 0: no vector.body
; is expected in any configuration.
;
define dso_local void @sgt_no_guard_0_startval(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_no_guard_0_startval(
; COMMON-NOT:   vector.body:
entry:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ 0, %entry ]
  %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %entry ]
  %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %entry ]
  %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %entry ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1
  %0 = load i8, i8* %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %b.addr.07, i32 1
  %1 = load i8, i8* %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, i8* %c.addr.08, i32 1
  store i8 %add, i8* %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; The counter is decremented by 2 per iteration instead of 1; a vector.body
; is still expected, with masked memory operations in the tail-folding runs.
;
define dso_local void @sgt_step_minus_two(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_step_minus_two(
; COMMON:       vector.body:
; CHECK-TF:     masked.load
; CHECK-TF:     masked.load
; CHECK-TF:     masked.store
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
  %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
  %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
  %dec = add nsw i32 %N.addr.09, -2
  %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1
  %0 = load i8, i8* %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %b.addr.07, i32 1
  %1 = load i8, i8* %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, i8* %c.addr.08, i32 1
  store i8 %add, i8* %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; The counter step is the function argument %S rather than a constant: no
; vector.body is expected in any configuration.
;
define dso_local void @sgt_step_not_constant(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N, i32 %S) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_step_not_constant(
; COMMON-NOT:   vector.body:
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
  %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
  %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
  %dec = add nsw i32 %N.addr.09, %S
  %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1
  %0 = load i8, i8* %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %b.addr.07, i32 1
  %1 = load i8, i8* %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, i8* %c.addr.08, i32 1
  store i8 %add, i8* %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; Guard and exit condition both use icmp eq against 0, and the exit compare
; tests the decremented value %dec. Only the presence of vector.body is
; checked here.
;
define dso_local void @icmp_eq(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) #0 {
; COMMON-LABEL: @icmp_eq(
; COMMON:       vector.body:
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %while.end, label %while.body.preheader

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
  %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
  %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
  %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1
  %0 = load i8, i8* %A.addr.07, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1
  %1 = load i8, i8* %B.addr.08, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1
  store i8 %add, i8* %C.addr.09, align 1
  %dec = add i32 %N.addr.010, -1
  %cmp = icmp eq i32 %dec, 0
  br i1 %cmp, label %while.end.loopexit, label %while.body

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; This IR corresponds to this type of C-code:
;
;  void f(char *a, char *b, char * __restrict c, int N) {
;    #pragma clang loop vectorize_width(16)
;    for (int i = N; i>0; i--)
;      c[i] = a[i] + b[i];
;  }
;
define dso_local void @sgt_for_loop(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_for_loop(
; COMMON:       vector.body:
; CHECK-PREFER: masked.load
; CHECK-PREFER: masked.load
; CHECK-PREFER: masked.store
;
; TODO: if tail-predication is requested, tail-folding isn't triggered because
; the profitability check returns "Different strides found, can't tail-predicate",
; investigate this.
;
; CHECK-ENABLE-TP-NOT: masked.load
; CHECK-ENABLE-TP-NOT: masked.load
; CHECK-ENABLE-TP-NOT: masked.store
;
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %for.body.preheader, label %for.end

for.body.preheader:
  br label %for.body

for.body:
  %i.011 = phi i32 [ %dec, %for.body ], [ %N, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011
  %1 = load i8, i8* %arrayidx1, align 1
  %add = add i8 %1, %0
  %arrayidx4 = getelementptr inbounds i8, i8* %c, i32 %i.011
  store i8 %add, i8* %arrayidx4, align 1
  %dec = add nsw i32 %i.011, -1
  %cmp = icmp sgt i32 %i.011, 1
  br i1 %cmp, label %for.body, label %for.end, !llvm.loop !1

for.end:
  ret void
}

; Variant of @sgt_for_loop whose counter is an i64 (zero-extended from %N)
; that is truncated back to i32 to index the arrays.
;
define dso_local void @sgt_for_loop_i64(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_for_loop_i64(
; COMMON:       vector.body:
;
; CHECK-PREFER: masked.load
; CHECK-PREFER: masked.load
; CHECK-PREFER: masked.store
;
; With only -tail-predication=enabled (the CHECK-ENABLE-TP run), the target
; hook returns "preferPredicateOverEpilogue: hardware-loop is not profitable."
; so here we don't expect the tail-folding. TODO: look into this.
;
; CHECK-ENABLE-TP-NOT: masked.load
; CHECK-ENABLE-TP-NOT: masked.load
; CHECK-ENABLE-TP-NOT: masked.store
;
entry:
  %cmp14 = icmp sgt i32 %N, 0
  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %conv16 = zext i32 %N to i64
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  ret void

for.body:
  %i.015 = phi i64 [ %dec, %for.body ], [ %conv16, %for.body.preheader ]
  %idxprom = trunc i64 %i.015 to i32
  %arrayidx = getelementptr inbounds i8, i8* %a, i32 %idxprom
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx4 = getelementptr inbounds i8, i8* %b, i32 %idxprom
  %1 = load i8, i8* %arrayidx4, align 1
  %add = add i8 %1, %0
  %arrayidx8 = getelementptr inbounds i8, i8* %c, i32 %idxprom
  store i8 %add, i8* %arrayidx8, align 1
  %dec = add nsw i64 %i.015, -1
  %cmp = icmp sgt i64 %i.015, 1
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !1
}

; This IR corresponds to this nested-loop:
;
;  for (int i = 0; i<N; i++)
;    for (int j = i+1; j>0; j--)
;      c[j] = a[j] + b[j];
;
; While the inner loop looks similar to the previous examples, we can't
; transform it because isGuarded returns false for the inner loop.
;
define dso_local void @sgt_nested_loop(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_nested_loop(
; DEFAULT-NOT:  vector.body:
; CHECK-TF-NOT: masked.load
; CHECK-TF-NOT: masked.load
; CHECK-TF-NOT: masked.store
; COMMON:       }
;
entry:
  %cmp21 = icmp sgt i32 %N, 0
  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.loopexit:
  %exitcond = icmp eq i32 %add, %N
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  ret void

for.body:
  %i.022 = phi i32 [ %add, %for.cond.loopexit ], [ 0, %for.body.preheader ]
  %add = add nuw nsw i32 %i.022, 1
  br label %for.body4

for.body4:                                        ; preds = %for.body, %for.body4
  %j.020 = phi i32 [ %add, %for.body ], [ %dec, %for.body4 ]
  %arrayidx = getelementptr inbounds i8, i8* %a, i32 %j.020
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx5 = getelementptr inbounds i8, i8* %b, i32 %j.020
  %1 = load i8, i8* %arrayidx5, align 1
  %add7 = add i8 %1, %0
  %arrayidx9 = getelementptr inbounds i8, i8* %c, i32 %j.020
  store i8 %add7, i8* %arrayidx9, align 1
  %dec = add nsw i32 %j.020, -1
  %cmp2 = icmp sgt i32 %j.020, 1
  br i1 %cmp2, label %for.body4, label %for.cond.loopexit
}

attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.vectorize.width", i32 16}