; NOTE: The single check directive below is hand-written; this file carries no
; autogenerated per-function assertions.
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

; The following functions should all fail to become tail-predicated.
;
; The vctp intrinsics produce a vector of i1 predicates rather than an i32, so
; a pattern that starts with "call i32" could never match one and the test
; would pass even if a loop had been converted. Match on the intrinsic name
; itself so that any vctp call in the output is caught.
; CHECK-NOT: @llvm.arm.{{.*}}vctp

; Each function below is the same masked-load / multiply / masked-store loop
; with exactly one deviation from the shape the pass is expected to accept.
; The entry block computes the hardware-loop iteration count as
; (((((%N + 3) >> 2) << 2) - 4) >> 2) + 1 and skips the loop when %N == 0.

; trip.count.minus.1 has been inserted into element 1, not 0.
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Deviation: the scalar lands in lane 1, but the shuffle below broadcasts
  ; lane 0, which is still undef.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Per-lane predicate shared by both loads and the store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The insert isn't using an undef for operand 0.
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Deviation: the base vector of the insert is a constant <1, 1, 1, 1>
  ; instead of undef.
  %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Per-lane predicate shared by both loads and the store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The shuffle uses a defined value for operand 1.
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Deviation: the second shuffle operand is a constant <1, 1, 1, 1> instead
  ; of undef.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Per-lane predicate shared by both loads and the store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The shuffle uses a non zero value for operand 2.
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Deviation: the shuffle mask selects lane 1 for every element, while the
  ; scalar was inserted into lane 0.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Per-lane predicate shared by both loads and the store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; %N - 2
; The splat itself is well formed (scalar inserted at element 0, undef
; operands, zero mask) so that the only deviation is the bound being %N - 2
; rather than %N - 1. Inserting at element 1 here would merely repeat the
; wrong-insert-lane case and broadcast an undef lane.
define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Deviation: the bound is %N - 2.
  %trip.count.minus.2 = add i32 %N, -2
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Per-lane predicate shared by both loads and the store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; index has been inserted at element 1, not 0.
define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Skip the loop when %N == 0; otherwise compute the iteration count as
  ; (((((%N + 3) >> 2) << 2) - 4) >> 2) + 1.
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.div.4 = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.div.4, 2
  %n.rounded.m4 = add i32 %n.rounded, -4
  %iters.m1 = lshr i32 %n.rounded.m4, 2
  %iters = add nuw nsw i32 %iters.m1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Broadcast %N - 1 to every lane.
  %last.elt = add i32 %N, -1
  %limit.ins = insertelement <4 x i32> undef, i32 %last.elt, i32 0
  %limit.splat = shufflevector <4 x i32> %limit.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %count.init = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %idx = phi i32 [ 0, %vector.ph ], [ %idx.next, %vector.body ]
  %count = phi i32 [ %count.init, %vector.ph ], [ %count.next, %vector.body ]
  ; Deviation: the index goes into lane 1 while the shuffle broadcasts lane 0.
  %idx.ins = insertelement <4 x i32> undef, i32 %idx, i32 1
  %idx.splat = shufflevector <4 x i32> %idx.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %lane.ids = add <4 x i32> %idx.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.elt = getelementptr inbounds i32, i32* %a, i32 %idx
  %mask = icmp ule <4 x i32> %lane.ids, %limit.splat
  %a.vec = bitcast i32* %a.elt to <4 x i32>*
  %a.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %a.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %b.elt = getelementptr inbounds i32, i32* %b, i32 %idx
  %b.vec = bitcast i32* %b.elt to <4 x i32>*
  %b.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %b.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %prod = mul nsw <4 x i32> %b.val, %a.val
  %c.elt = getelementptr inbounds i32, i32* %c, i32 %idx
  %c.vec = bitcast i32* %c.elt to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %prod, <4 x i32>* %c.vec, i32 4, <4 x i1> %mask)
  %idx.next = add i32 %idx, 4
  %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
  %again = icmp ne i32 %count.next, 0
  br i1 %again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The value splatted in the loop body is %index + 1, not the index phi itself.
define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Skip the loop when %N == 0; otherwise compute the iteration count as
  ; (((((%N + 3) >> 2) << 2) - 4) >> 2) + 1.
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.div.4 = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.div.4, 2
  %n.rounded.m4 = add i32 %n.rounded, -4
  %iters.m1 = lshr i32 %n.rounded.m4, 2
  %iters = add nuw nsw i32 %iters.m1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Broadcast %N - 1 to every lane.
  %last.elt = add i32 %N, -1
  %limit.ins = insertelement <4 x i32> undef, i32 %last.elt, i32 0
  %limit.splat = shufflevector <4 x i32> %limit.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %count.init = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %idx = phi i32 [ 0, %vector.ph ], [ %idx.next, %vector.body ]
  %count = phi i32 [ %count.init, %vector.ph ], [ %count.next, %vector.body ]
  ; Deviation: the splat is built from %idx + 1.
  %incorrect = add i32 %idx, 1
  %idx.ins = insertelement <4 x i32> undef, i32 %incorrect, i32 0
  %idx.splat = shufflevector <4 x i32> %idx.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %lane.ids = add <4 x i32> %idx.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.elt = getelementptr inbounds i32, i32* %a, i32 %idx
  %mask = icmp ule <4 x i32> %lane.ids, %limit.splat
  %a.vec = bitcast i32* %a.elt to <4 x i32>*
  %a.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %a.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %b.elt = getelementptr inbounds i32, i32* %b, i32 %idx
  %b.vec = bitcast i32* %b.elt to <4 x i32>*
  %b.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %b.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %prod = mul nsw <4 x i32> %b.val, %a.val
  %c.elt = getelementptr inbounds i32, i32* %c, i32 %idx
  %c.vec = bitcast i32* %c.elt to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %prod, <4 x i32>* %c.vec, i32 4, <4 x i1> %mask)
  %idx.next = add i32 %idx, 4
  %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
  %again = icmp ne i32 %count.next, 0
  br i1 %again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Now using ult, not ule for the vector icmp
define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Skip the loop when %N == 0; otherwise compute the iteration count as
  ; (((((%N + 3) >> 2) << 2) - 4) >> 2) + 1.
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.div.4 = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.div.4, 2
  %n.rounded.m4 = add i32 %n.rounded, -4
  %iters.m1 = lshr i32 %n.rounded.m4, 2
  %iters = add nuw nsw i32 %iters.m1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Broadcast %N - 1 to every lane.
  %last.elt = add i32 %N, -1
  %limit.ins = insertelement <4 x i32> undef, i32 %last.elt, i32 0
  %limit.splat = shufflevector <4 x i32> %limit.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %count.init = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %idx = phi i32 [ 0, %vector.ph ], [ %idx.next, %vector.body ]
  %count = phi i32 [ %count.init, %vector.ph ], [ %count.next, %vector.body ]
  %idx.ins = insertelement <4 x i32> undef, i32 %idx, i32 0
  %idx.splat = shufflevector <4 x i32> %idx.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %lane.ids = add <4 x i32> %idx.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.elt = getelementptr inbounds i32, i32* %a, i32 %idx
  ; Deviation: strict unsigned less-than against the %N - 1 splat.
  %mask = icmp ult <4 x i32> %lane.ids, %limit.splat
  %a.vec = bitcast i32* %a.elt to <4 x i32>*
  %a.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %a.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %b.elt = getelementptr inbounds i32, i32* %b, i32 %idx
  %b.vec = bitcast i32* %b.elt to <4 x i32>*
  %b.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %b.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %prod = mul nsw <4 x i32> %b.val, %a.val
  %c.elt = getelementptr inbounds i32, i32* %c, i32 %idx
  %c.vec = bitcast i32* %c.elt to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %prod, <4 x i32>* %c.vec, i32 4, <4 x i1> %mask)
  %idx.next = add i32 %idx, 4
  %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
  %again = icmp ne i32 %count.next, 0
  br i1 %again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The add in the body uses 1, 2, 3, 4
define void @wrong_body_broadcast_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Skip the loop when %N == 0; otherwise compute the iteration count as
  ; (((((%N + 3) >> 2) << 2) - 4) >> 2) + 1.
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.div.4 = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.div.4, 2
  %n.rounded.m4 = add i32 %n.rounded, -4
  %iters.m1 = lshr i32 %n.rounded.m4, 2
  %iters = add nuw nsw i32 %iters.m1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Broadcast %N - 1 to every lane.
  %last.elt = add i32 %N, -1
  %limit.ins = insertelement <4 x i32> undef, i32 %last.elt, i32 0
  %limit.splat = shufflevector <4 x i32> %limit.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %count.init = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %idx = phi i32 [ 0, %vector.ph ], [ %idx.next, %vector.body ]
  %count = phi i32 [ %count.init, %vector.ph ], [ %count.next, %vector.body ]
  %idx.ins = insertelement <4 x i32> undef, i32 %idx, i32 0
  %idx.splat = shufflevector <4 x i32> %idx.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Deviation: lane offsets start at 1 instead of 0.
  %lane.ids = add <4 x i32> %idx.splat, <i32 1, i32 2, i32 3, i32 4>
  %a.elt = getelementptr inbounds i32, i32* %a, i32 %idx
  %mask = icmp ule <4 x i32> %lane.ids, %limit.splat
  %a.vec = bitcast i32* %a.elt to <4 x i32>*
  %a.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %a.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %b.elt = getelementptr inbounds i32, i32* %b, i32 %idx
  %b.vec = bitcast i32* %b.elt to <4 x i32>*
  %b.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %b.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %prod = mul nsw <4 x i32> %b.val, %a.val
  %c.elt = getelementptr inbounds i32, i32* %c, i32 %idx
  %c.vec = bitcast i32* %c.elt to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %prod, <4 x i32>* %c.vec, i32 4, <4 x i1> %mask)
  %idx.next = add i32 %idx, 4
  %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
  %again = icmp ne i32 %count.next, 0
  br i1 %again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Using a variable for the loop body broadcast.
define void @wrong_body_broadcast_splat_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
entry:
  ; Skip the loop when %N == 0; otherwise compute the iteration count as
  ; (((((%N + 3) >> 2) << 2) - 4) >> 2) + 1.
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.div.4 = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.div.4, 2
  %n.rounded.m4 = add i32 %n.rounded, -4
  %iters.m1 = lshr i32 %n.rounded.m4, 2
  %iters = add nuw nsw i32 %iters.m1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Broadcast %N - 1 to every lane.
  %last.elt = add i32 %N, -1
  %limit.ins = insertelement <4 x i32> undef, i32 %last.elt, i32 0
  %limit.splat = shufflevector <4 x i32> %limit.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %count.init = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %idx = phi i32 [ 0, %vector.ph ], [ %idx.next, %vector.body ]
  %count = phi i32 [ %count.init, %vector.ph ], [ %count.next, %vector.body ]
  %idx.ins = insertelement <4 x i32> undef, i32 %idx, i32 0
  %idx.splat = shufflevector <4 x i32> %idx.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Deviation: lane offsets come from the %offsets argument, not a constant.
  %lane.ids = add <4 x i32> %idx.splat, %offsets
  %a.elt = getelementptr inbounds i32, i32* %a, i32 %idx
  %mask = icmp ule <4 x i32> %lane.ids, %limit.splat
  %a.vec = bitcast i32* %a.elt to <4 x i32>*
  %a.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %a.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %b.elt = getelementptr inbounds i32, i32* %b, i32 %idx
  %b.vec = bitcast i32* %b.elt to <4 x i32>*
  %b.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %b.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %prod = mul nsw <4 x i32> %b.val, %a.val
  %c.elt = getelementptr inbounds i32, i32* %c, i32 %idx
  %c.vec = bitcast i32* %c.elt to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %prod, <4 x i32>* %c.vec, i32 4, <4 x i1> %mask)
  %idx.next = add i32 %idx, 4
  %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
  %again = icmp ne i32 %count.next, 0
  br i1 %again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; adding 5, instead of 4, to index.
define void @wrong_index_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Skip the loop when %N == 0; otherwise compute the iteration count as
  ; (((((%N + 3) >> 2) << 2) - 4) >> 2) + 1.
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.div.4 = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.div.4, 2
  %n.rounded.m4 = add i32 %n.rounded, -4
  %iters.m1 = lshr i32 %n.rounded.m4, 2
  %iters = add nuw nsw i32 %iters.m1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Broadcast %N - 1 to every lane.
  %last.elt = add i32 %N, -1
  %limit.ins = insertelement <4 x i32> undef, i32 %last.elt, i32 0
  %limit.splat = shufflevector <4 x i32> %limit.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %count.init = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %idx = phi i32 [ 0, %vector.ph ], [ %idx.next, %vector.body ]
  %count = phi i32 [ %count.init, %vector.ph ], [ %count.next, %vector.body ]
  %idx.ins = insertelement <4 x i32> undef, i32 %idx, i32 0
  %idx.splat = shufflevector <4 x i32> %idx.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %lane.ids = add <4 x i32> %idx.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.elt = getelementptr inbounds i32, i32* %a, i32 %idx
  %mask = icmp ule <4 x i32> %lane.ids, %limit.splat
  %a.vec = bitcast i32* %a.elt to <4 x i32>*
  %a.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %a.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %b.elt = getelementptr inbounds i32, i32* %b, i32 %idx
  %b.vec = bitcast i32* %b.elt to <4 x i32>*
  %b.val = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %b.vec, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %prod = mul nsw <4 x i32> %b.val, %a.val
  %c.elt = getelementptr inbounds i32, i32* %c, i32 %idx
  %c.vec = bitcast i32* %c.elt to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %prod, <4 x i32>* %c.vec, i32 4, <4 x i1> %mask)
  ; Deviation: the index advances by 5 per iteration although the vectors
  ; hold 4 elements.
  %idx.next = add i32 %idx, 5
  %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
  %again = icmp ne i32 %count.next, 0
  br i1 %again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsics called by the test functions. The attribute groups #1, #2 and #3
; are referenced here; their definitions are not visible in this part of the
; file.
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3