; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s

define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
; CHECK-LABEL: mul_reduce_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vmul.i32 q0, q2, q0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %a, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = getelementptr inbounds i32, i32* %b, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
  %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
  %6 = add nsw <4 x i32> %5, %vec.phi
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
  ret i32 %res.0.lcssa
}

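; As in @mul_reduce_add above, the reduction value is live out of the loop: the
; expected code keeps an explicit VCTP inside a DLS/LE low-overhead loop, and the
; final lane mask feeds the VPSEL in the middle block, rather than the loop being
; converted to a DLSTP/LETP tail-predicated form.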
define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
; CHECK-LABEL: mul_reduce_add_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r1, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %a, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
  ret i32 %res.0.lcssa
}

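; Same structure as @mul_reduce_add_const: a single predicated load accumulated
; into a live-out vector PHI, so the expected code is again VCTP + DLS/LE with a
; VPSEL and VADDV after the loop.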
define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
; CHECK-LABEL: add_reduce_add_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r1, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %a, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
  ret i32 %res.0.lcssa
}

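; No live-out values here: the loop multiplies each loaded vector by a scalar and
; stores the result back, so a fully tail-predicated DLSTP.32/LETP loop is
; expected, with the VLDRW/VSTRW left unpredicated (the predication is implicit
; in the tail-predicated loop).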
define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
; CHECK-LABEL: vector_mul_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vmul.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %b, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
  %4 = getelementptr inbounds i32, i32* %a, i32 %index
  %5 = bitcast i32* %4 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

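; Same as @vector_mul_const but with a vector-plus-scalar add; again expected to
; become a DLSTP.32/LETP tail-predicated loop.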
define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
; CHECK-LABEL: vector_add_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %b, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
  %4 = getelementptr inbounds i32, i32* %a, i32 %index
  %5 = bitcast i32* %4 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

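; The i8 and i16 versions below cover the narrower element sizes: 16 lanes per
; iteration with DLSTP.8/LETP here, and 8 lanes per iteration with DLSTP.16/LETP
; in @vector_mul_vector_i16.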
define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) {
; CHECK-LABEL: vector_mul_vector_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #16
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vldrb.u8 q1, [r2], #16
; CHECK-NEXT: vmul.i8 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = getelementptr inbounds i8, i8* %b, i32 %index

  ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
  %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)

  %2 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
  %3 = getelementptr inbounds i8, i8* %c, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> undef)
  %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load
  %6 = getelementptr inbounds i8, i8* %a, i32 %index
  %7 = bitcast i8* %6 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %5, <16 x i8>* %7, i32 1, <16 x i1> %1)
  %index.next = add i32 %index, 16
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; Function Attrs: nofree norecurse nounwind
define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: vector_mul_vector_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r3
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #8
; CHECK-NEXT: vldrh.u16 q0, [r1], #16
; CHECK-NEXT: vldrh.u16 q1, [r2], #16
; CHECK-NEXT: vmul.i16 q0, q1, q0
; CHECK-NEXT: vstrh.16 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB6_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %0 = getelementptr inbounds i16, i16* %b, i32 %index

  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
  %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

  %2 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
  %3 = getelementptr inbounds i16, i16* %c, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %1, <8 x i16> undef)
  %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load
  %6 = getelementptr inbounds i16, i16* %a, i32 %index
  %7 = bitcast i16* %6 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %5, <8 x i16>* %7, i32 2, <8 x i1> %1)
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)

attributes #0 = { nofree norecurse nounwind }