; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s

define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) {
; CHECK-LABEL: gather_inc_mini_4i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov.i32 q1, #0x4
; CHECK-NEXT: vadd.i32 q1, q0, q1
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %wide.masked.gather
}

define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) {
; CHECK-LABEL: gather_inc_minipred_4i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov.i32 q1, #0x4
; CHECK-NEXT: movw r1, #3855
; CHECK-NEXT: vadd.i32 q1, q0, q1
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef)
  ret <4 x i32> %wide.masked.gather
}

define arm_aapcs_vfpcc <8 x i16> @gather_inc_mini_8i16(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, <8 x i32> %offs) {
; CHECK-LABEL: gather_inc_mini_8i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vmov.i32 q2, #0x10
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vshl.i32 q1, q1, #1
; CHECK-NEXT: vadd.i32 q0, q0, q2
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vadd.i32 q1, q1, q2
; CHECK-NEXT: vmov r2, s3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r5, s1
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r4, s7
; CHECK-NEXT: ldrh.w r12, [r1]
; CHECK-NEXT: vmov r1, s5
; CHECK-NEXT: ldrh.w lr, [r2]
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: ldrh r5, [r5]
; CHECK-NEXT: vmov.16 q0[0], r3
; CHECK-NEXT: ldrh r0, [r0]
; CHECK-NEXT: vmov.16 q0[1], r5
; CHECK-NEXT: ldrh r4, [r4]
; CHECK-NEXT: vmov.16 q0[2], r12
; CHECK-NEXT: vmov.16 q0[3], lr
; CHECK-NEXT: vmov.16 q0[4], r0
; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: vmov.16 q0[5], r1
; CHECK-NEXT: vmov.16 q0[6], r2
; CHECK-NEXT: vmov.16 q0[7], r4
; CHECK-NEXT: pop {r4, r5, r7, pc}
  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1
  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %wide.masked.gather
}

define arm_aapcs_vfpcc <8 x i16> @gather_inc_minipred_8i16(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, <8 x i32> %offs) {
; CHECK-LABEL: gather_inc_minipred_8i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vmov.i32 q2, #0x10
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vshl.i32 q1, q1, #1
; CHECK-NEXT: vadd.i32 q0, q0, q2
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vadd.i32 q1, q1, q2
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: vmov.16 q0[0], r1
; CHECK-NEXT: ldrh r0, [r0]
; CHECK-NEXT: vmov.16 q0[2], r3
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: vmov.16 q0[4], r0
; CHECK-NEXT: vmov.16 q0[6], r2
; CHECK-NEXT: bx lr
  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1
  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> undef)
  ret <8 x i16> %wide.masked.gather
}

define arm_aapcs_vfpcc <16 x i8> @gather_inc_mini_16i8(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, <16 x i32> %offs) {
; CHECK-LABEL: gather_inc_mini_16i8:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.i32 q4, #0x10
; CHECK-NEXT: vadd.i32 q3, q3, r0
; CHECK-NEXT: vadd.i32 q3, q3, q4
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r1, s14
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vadd.i32 q1, q1, q4
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vadd.i32 q2, q2, q4
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r4, s11
; CHECK-NEXT: ldrb.w r12, [r1]
; CHECK-NEXT: vmov r1, s15
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb.w lr, [r1]
; CHECK-NEXT: vmov r1, s12
; CHECK-NEXT: ldrb r3, [r1]
; CHECK-NEXT: vmov r1, s13
; CHECK-NEXT: vadd.i32 q3, q0, q4
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r6, s15
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: vmov.8 q0[0], r5
; CHECK-NEXT: vmov r5, s13
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[2], r5
; CHECK-NEXT: vmov r5, s8
; CHECK-NEXT: vmov.8 q0[3], r6
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov r0, s5
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vmov.8 q0[5], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov r0, s7
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vmov.8 q0[7], r0
; CHECK-NEXT: vmov r0, s9
; CHECK-NEXT: vmov.8 q0[8], r5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vmov.8 q0[9], r0
; CHECK-NEXT: vmov.8 q0[10], r2
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r1
; CHECK-NEXT: vmov.8 q0[14], r12
; CHECK-NEXT: vmov.8 q0[15], lr
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r6, pc}
  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = getelementptr inbounds i8, i8* %data, <16 x i32> %1
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  ret <16 x i8> %wide.masked.gather
}

define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, <16 x i32> %offs) {
; CHECK-LABEL: gather_inc_minipred_16i8:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.i32 q4, #0x10
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: vadd.i32 q2, q2, q4
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vadd.i32 q1, q1, q4
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q0, q0, q4
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: vmov r5, s2
; CHECK-NEXT: ldrb.w lr, [r2]
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ldrb.w r12, [r1]
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vadd.i32 q1, q3, r0
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: vadd.i32 q1, q1, q4
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r4, s6
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[0], r2
; CHECK-NEXT: vmov.8 q0[2], r5
; CHECK-NEXT: vmov.8 q0[4], r12
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vmov.8 q0[6], r1
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: vmov.8 q0[8], lr
; CHECK-NEXT: vmov.8 q0[10], r3
; CHECK-NEXT: vmov.8 q0[12], r0
; CHECK-NEXT: vmov.8 q0[14], r4
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r7, pc}
  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = getelementptr inbounds i8, i8* %data, <16 x i32> %1
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
  ret <16 x i8> %wide.masked.gather
}

define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: gather_pre_inc:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI6_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI6_0:
; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 0 @ 0x0
vector.ph: ; preds = %for.body.preheader
  %ind.end = shl i32 %n.vec, 1
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %end, label %vector.body

end:
  ret void;
}

define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec43) {
; CHECK-LABEL: gather_post_inc:
; CHECK: @ %bb.0: @ %vector.ph41
; CHECK-NEXT: adr r3, .LCPI7_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB7_1: @ %vector.body39
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI7_0:
; CHECK-NEXT: .long 4294967200 @ 0xffffffa0
; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
vector.ph41: ; preds = %for.body6.preheader
  %ind.end47 = shl i32 %n.vec43, 1
  br label %vector.body39

vector.body39: ; preds = %vector.body39, %vector.ph41
  %index44 = phi i32 [ 0, %vector.ph41 ], [ %index.next45, %vector.body39 ]
  %vec.ind50 = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph41 ], [ %vec.ind.next51, %vector.body39 ]
  %0 = mul nuw nsw <4 x i32> %vec.ind50, <i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
  %wide.masked.gather55 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %2 = getelementptr inbounds i32, i32* %dst, i32 %index44
  %3 = bitcast i32* %2 to <4 x i32>*
  store <4 x i32> %wide.masked.gather55, <4 x i32>* %3, align 4
  %index.next45 = add i32 %index44, 4
  %vec.ind.next51 = add <4 x i32> %vec.ind50, <i32 8, i32 8, i32 8, i32 8>
  %4 = icmp eq i32 %index.next45, %n.vec43
  br i1 %4, label %end, label %vector.body39

end:
  ret void;
}

define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v4i32_simple:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB8_1: @ %vector.ph.preheader
; CHECK-NEXT: bic r12, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w lr, r12, #4
; CHECK-NEXT: add.w r4, r3, lr, lsr #2
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB8_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB8_3 Depth 2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB8_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB8_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [q1, #16]!
; CHECK-NEXT: vstrb.8 q2, [r0], #16
; CHECK-NEXT: le lr, .LBB8_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB8_2 Depth=1
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: bne .LBB8_2
; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
; CHECK-NEXT: .LCPI8_0:
; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
; CHECK-NEXT: .long 4294967284 @ 0xfffffff4
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
; CHECK-NEXT: .long 4294967292 @ 0xfffffffc
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %data, <4 x i32> %vec.ind
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %1 = getelementptr inbounds i32, i32* %dst, i32 %index
  %2 = bitcast i32* %1 to <4 x i32>*
  store <4 x i32> %wide.masked.gather, <4 x i32>* %2, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  ret void
}

define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v4i32_complex:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: blt .LBB9_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT: bic r12, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w lr, r12, #4
; CHECK-NEXT: adr r4, .LCPI9_1
; CHECK-NEXT: adr r5, .LCPI9_2
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: add.w r3, r3, lr, lsr #2
; CHECK-NEXT: adr.w lr, .LCPI9_0
; CHECK-NEXT: vldrw.u32 q0, [r5]
; CHECK-NEXT: vldrw.u32 q2, [lr]
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_3 Depth 2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q6, [q5, #48]!
; CHECK-NEXT: vldrw.u32 q7, [q3, #48]!
; CHECK-NEXT: vadd.i32 q6, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [q4, #48]!
; CHECK-NEXT: vadd.i32 q6, q6, q7
; CHECK-NEXT: vstrb.8 q6, [r0], #16
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: .LBB9_5: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
; CHECK-NEXT: .LCPI9_0:
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967260 @ 0xffffffdc
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 4294967284 @ 0xfffffff4
; CHECK-NEXT: .LCPI9_1:
; CHECK-NEXT: .long 4294967252 @ 0xffffffd4
; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
; CHECK-NEXT: .long 4294967276 @ 0xffffffec
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
; CHECK-NEXT: .LCPI9_2:
; CHECK-NEXT: .long 4294967256 @ 0xffffffd8
; CHECK-NEXT: .long 4294967268 @ 0xffffffe4
; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
; CHECK-NEXT: .long 4294967292 @ 0xfffffffc
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  %3 = getelementptr inbounds i32, i32* %data, <4 x i32> %2
  %wide.masked.gather24 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
  %5 = getelementptr inbounds i32, i32* %data, <4 x i32> %4
  %wide.masked.gather25 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %6 = add nsw <4 x i32> %wide.masked.gather24, %wide.masked.gather
  %7 = add nsw <4 x i32> %6, %wide.masked.gather25
  %8 = getelementptr inbounds i32, i32* %dst, i32 %index
  %9 = bitcast i32* %8 to <4 x i32>*
  store <4 x i32> %7, <4 x i32>* %9, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  ret void
}

define arm_aapcs_vfpcc void @gather_inc_v4i32_large(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v4i32_large:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB10_1: @ %vector.ph.preheader
; CHECK-NEXT: bic r12, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w lr, r12, #4
; CHECK-NEXT: add.w r4, r3, lr, lsr #2
; CHECK-NEXT: adr r3, .LCPI10_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB10_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB10_3 Depth 2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB10_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [q1, #508]!
; CHECK-NEXT: vstrb.8 q2, [r0], #16
; CHECK-NEXT: le lr, .LBB10_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_2 Depth=1
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .long 4294966788 @ 0xfffffe04
; CHECK-NEXT: .long 4294966792 @ 0xfffffe08
; CHECK-NEXT: .long 4294966796 @ 0xfffffe0c
; CHECK-NEXT: .long 4294966800 @ 0xfffffe10
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %data, <4 x i32> %vec.ind
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %1 = getelementptr inbounds i32, i32* %dst, i32 %index
  %2 = bitcast i32* %1 to <4 x i32>*
  store <4 x i32> %wide.masked.gather, <4 x i32>* %2, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 127, i32 127, i32 127, i32 127>
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  ret void
}

; TODO: uneven - I think it's not possible to create such an example, because vec.ind will always be increased by a vector with 4 elements (=> x*4 = even)

; TODO: What is sxth?
define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v8i16_simple:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: blt .LBB11_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT: bic r8, r2, #7
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: sub.w r6, r8, #8
; CHECK-NEXT: vmov.i16 q1, #0x8
; CHECK-NEXT: add.w r1, r5, r6, lsr #3
; CHECK-NEXT: adr r6, .LCPI11_0
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB11_3 Depth 2
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB11_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vmov.u16 r7, q2[4]
; CHECK-NEXT: vmov.u16 r5, q2[0]
; CHECK-NEXT: vmov.32 q4[0], r7
; CHECK-NEXT: vmov.u16 r7, q2[5]
; CHECK-NEXT: vmov.32 q4[1], r7
; CHECK-NEXT: vmov.u16 r7, q2[6]
; CHECK-NEXT: vmov.32 q4[2], r7
; CHECK-NEXT: vmov.u16 r7, q2[7]
; CHECK-NEXT: vmov.32 q4[3], r7
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmovlb.s16 q4, q4
; CHECK-NEXT: vmov.u16 r5, q2[1]
; CHECK-NEXT: vshl.i32 q4, q4, #1
; CHECK-NEXT: vmov.32 q3[1], r5
; CHECK-NEXT: vadd.i32 q4, q4, r0
; CHECK-NEXT: vmov.u16 r5, q2[2]
; CHECK-NEXT: vmov r7, s16
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.u16 r5, q2[3]
; CHECK-NEXT: vmov r3, s17
; CHECK-NEXT: vmov.32 q3[3], r5
; CHECK-NEXT: vadd.i16 q2, q2, q1
; CHECK-NEXT: vmovlb.s16 q3, q3
; CHECK-NEXT: vshl.i32 q3, q3, #1
; CHECK-NEXT: vadd.i32 q3, q3, r0
; CHECK-NEXT: vmov r5, s15
; CHECK-NEXT: vmov r6, s14
; CHECK-NEXT: vmov r12, s13
; CHECK-NEXT: ldrh.w r11, [r7]
; CHECK-NEXT: vmov r7, s12
; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: ldrh.w r9, [r5]
; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: ldrh.w r10, [r6]
; CHECK-NEXT: vmov r6, s19
; CHECK-NEXT: ldrh.w r1, [r12]
; CHECK-NEXT: ldrh r7, [r7]
; CHECK-NEXT: vmov.16 q3[0], r7
; CHECK-NEXT: vmov.16 q3[1], r1
; CHECK-NEXT: vmov.16 q3[2], r10
; CHECK-NEXT: vmov.16 q3[3], r9
; CHECK-NEXT: vmov.16 q3[4], r11
; CHECK-NEXT: ldrh r5, [r5]
; CHECK-NEXT: vmov.16 q3[5], r3
; CHECK-NEXT: ldrh r6, [r6]
; CHECK-NEXT: vmov.16 q3[6], r5
; CHECK-NEXT: vmov.16 q3[7], r6
; CHECK-NEXT: vstrb.8 q3, [r4], #16
; CHECK-NEXT: le lr, .LBB11_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1
; CHECK-NEXT: cmp r8, r2
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
; CHECK-NEXT: .LCPI11_0:
; CHECK-NEXT: .short 0 @ 0x0
; CHECK-NEXT: .short 1 @ 0x1
; CHECK-NEXT: .short 2 @ 0x2
; CHECK-NEXT: .short 3 @ 0x3
; CHECK-NEXT: .short 4 @ 0x4
; CHECK-NEXT: .short 5 @ 0x5
; CHECK-NEXT: .short 6 @ 0x6
; CHECK-NEXT: .short 7 @ 0x7


entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -8
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %data, <8 x i16> %vec.ind
  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %0, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %1 = getelementptr inbounds i16, i16* %dst, i32 %index
  %2 = bitcast i16* %1 to <8 x i16>*
  store <8 x i16> %wide.masked.gather, <8 x i16>* %2, align 2
  %index.next = add i32 %index, 8
  %vec.ind.next = add <8 x i16> %vec.ind, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  ret void
}

; TODO: This looks absolutely terrifying :(
define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v8i16_complex:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #104
; CHECK-NEXT: sub sp, #104
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
;
CHECK-NEXT: blt.w .LBB12_5 706; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader 707; CHECK-NEXT: bic r8, r2, #7 708; CHECK-NEXT: adr r6, .LCPI12_2 709; CHECK-NEXT: sub.w r3, r8, #8 710; CHECK-NEXT: vldrw.u32 q0, [r6] 711; CHECK-NEXT: movs r7, #1 712; CHECK-NEXT: vmov.i16 q3, #0x18 713; CHECK-NEXT: add.w r1, r7, r3, lsr #3 714; CHECK-NEXT: adr r3, .LCPI12_0 715; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill 716; CHECK-NEXT: vldrw.u32 q0, [r3] 717; CHECK-NEXT: adr r7, .LCPI12_1 718; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill 719; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill 720; CHECK-NEXT: vldrw.u32 q0, [r7] 721; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill 722; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 723; CHECK-NEXT: .LBB12_2: @ %vector.ph 724; CHECK-NEXT: @ =>This Loop Header: Depth=1 725; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 726; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload 727; CHECK-NEXT: dls lr, r1 728; CHECK-NEXT: ldr r4, [sp, #60] @ 4-byte Reload 729; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload 730; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload 731; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload 732; CHECK-NEXT: .LBB12_3: @ %vector.body 733; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 734; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 735; CHECK-NEXT: vmov.u16 r3, q5[0] 736; CHECK-NEXT: vmov.u16 r7, q7[4] 737; CHECK-NEXT: vmov.32 q0[0], r3 738; CHECK-NEXT: vmov.u16 r3, q5[1] 739; CHECK-NEXT: vmov.32 q0[1], r3 740; CHECK-NEXT: vmov.u16 r3, q5[2] 741; CHECK-NEXT: vmov.32 q0[2], r3 742; CHECK-NEXT: vmov.u16 r3, q5[3] 743; CHECK-NEXT: vmov.32 q0[3], r3 744; CHECK-NEXT: vmov.u16 r12, q6[0] 745; CHECK-NEXT: vmovlb.s16 q0, q0 746; CHECK-NEXT: vmov.32 q1[0], r12 747; CHECK-NEXT: vshl.i32 q0, q0, #1 748; CHECK-NEXT: vmov.u16 r1, q6[1] 749; CHECK-NEXT: vadd.i32 q2, q0, r0 750; CHECK-NEXT: vmov.32 q1[1], r1 751; CHECK-NEXT: vmov r3, s10 752; CHECK-NEXT: vmov.u16 r1, q6[2] 753; CHECK-NEXT: vmov.32 q1[2], r1 754; CHECK-NEXT: 
vmov.u16 r1, q6[3] 755; CHECK-NEXT: vmov.32 q1[3], r1 756; CHECK-NEXT: vmov.u16 r1, q6[4] 757; CHECK-NEXT: vmovlb.s16 q1, q1 758; CHECK-NEXT: vmov r6, s11 759; CHECK-NEXT: vshl.i32 q1, q1, #1 760; CHECK-NEXT: vadd.i32 q4, q1, r0 761; CHECK-NEXT: ldrh.w r9, [r3] 762; CHECK-NEXT: vmov.u16 r3, q5[4] 763; CHECK-NEXT: vmov.32 q0[0], r3 764; CHECK-NEXT: vmov.u16 r3, q5[5] 765; CHECK-NEXT: vmov.32 q0[1], r3 766; CHECK-NEXT: vmov.u16 r3, q5[6] 767; CHECK-NEXT: vmov.32 q0[2], r3 768; CHECK-NEXT: vmov.u16 r3, q5[7] 769; CHECK-NEXT: vmov.32 q0[3], r3 770; CHECK-NEXT: ldrh r6, [r6] 771; CHECK-NEXT: vmovlb.s16 q0, q0 772; CHECK-NEXT: vshl.i32 q0, q0, #1 773; CHECK-NEXT: vadd.i32 q0, q0, r0 774; CHECK-NEXT: vmov r3, s0 775; CHECK-NEXT: vmov r5, s3 776; CHECK-NEXT: ldrh.w r10, [r3] 777; CHECK-NEXT: vmov r3, s1 778; CHECK-NEXT: ldrh r5, [r5] 779; CHECK-NEXT: ldrh.w r11, [r3] 780; CHECK-NEXT: vmov r3, s2 781; CHECK-NEXT: vmov.32 q0[0], r7 782; CHECK-NEXT: vmov.u16 r7, q7[5] 783; CHECK-NEXT: vmov.32 q0[1], r7 784; CHECK-NEXT: vmov.u16 r7, q7[6] 785; CHECK-NEXT: vmov.32 q0[2], r7 786; CHECK-NEXT: vmov.u16 r7, q7[7] 787; CHECK-NEXT: vmov.32 q0[3], r7 788; CHECK-NEXT: vmovlb.s16 q0, q0 789; CHECK-NEXT: vshl.i32 q0, q0, #1 790; CHECK-NEXT: vadd.i32 q0, q0, r0 791; CHECK-NEXT: vmov r7, s2 792; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill 793; CHECK-NEXT: vmov.32 q0[0], r1 794; CHECK-NEXT: vmov.u16 r1, q6[5] 795; CHECK-NEXT: vmov.32 q0[1], r1 796; CHECK-NEXT: vmov.u16 r1, q6[6] 797; CHECK-NEXT: vmov.32 q0[2], r1 798; CHECK-NEXT: vmov.u16 r1, q6[7] 799; CHECK-NEXT: vmov.32 q0[3], r1 800; CHECK-NEXT: vmov.u16 r1, q7[0] 801; CHECK-NEXT: vmov.32 q3[0], r1 802; CHECK-NEXT: vmov.u16 r1, q7[1] 803; CHECK-NEXT: vmov.32 q3[1], r1 804; CHECK-NEXT: vmov.u16 r1, q7[2] 805; CHECK-NEXT: vmov.32 q3[2], r1 806; CHECK-NEXT: vmov.u16 r1, q7[3] 807; CHECK-NEXT: vmov.32 q3[3], r1 808; CHECK-NEXT: vmov r1, s8 809; CHECK-NEXT: vmovlb.s16 q0, q0 810; CHECK-NEXT: vmovlb.s16 q3, q3 811; CHECK-NEXT: vshl.i32 
q0, q0, #1 812; CHECK-NEXT: vshl.i32 q3, q3, #1 813; CHECK-NEXT: vadd.i32 q0, q0, r0 814; CHECK-NEXT: vadd.i32 q3, q3, r0 815; CHECK-NEXT: ldrh r3, [r3] 816; CHECK-NEXT: ldrh r7, [r7] 817; CHECK-NEXT: ldrh r1, [r1] 818; CHECK-NEXT: vmov.16 q1[0], r1 819; CHECK-NEXT: vmov r1, s9 820; CHECK-NEXT: ldrh r1, [r1] 821; CHECK-NEXT: vmov.16 q1[1], r1 822; CHECK-NEXT: vmov r1, s16 823; CHECK-NEXT: vmov.16 q1[2], r9 824; CHECK-NEXT: vmov.16 q1[3], r6 825; CHECK-NEXT: vmov.16 q1[4], r10 826; CHECK-NEXT: vmov.16 q1[5], r11 827; CHECK-NEXT: vmov.16 q1[6], r3 828; CHECK-NEXT: vmov.16 q1[7], r5 829; CHECK-NEXT: ldrh r1, [r1] 830; CHECK-NEXT: vmov.16 q2[0], r1 831; CHECK-NEXT: vmov r1, s17 832; CHECK-NEXT: ldrh r1, [r1] 833; CHECK-NEXT: vmov.16 q2[1], r1 834; CHECK-NEXT: vmov r1, s18 835; CHECK-NEXT: ldrh r1, [r1] 836; CHECK-NEXT: vmov.16 q2[2], r1 837; CHECK-NEXT: vmov r1, s19 838; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload 839; CHECK-NEXT: ldrh r1, [r1] 840; CHECK-NEXT: vmov.16 q2[3], r1 841; CHECK-NEXT: vmov r1, s0 842; CHECK-NEXT: ldrh r1, [r1] 843; CHECK-NEXT: vmov.16 q2[4], r1 844; CHECK-NEXT: vmov r1, s1 845; CHECK-NEXT: ldrh r1, [r1] 846; CHECK-NEXT: vmov.16 q2[5], r1 847; CHECK-NEXT: vmov r1, s2 848; CHECK-NEXT: ldrh r1, [r1] 849; CHECK-NEXT: vmov.16 q2[6], r1 850; CHECK-NEXT: vmov r1, s3 851; CHECK-NEXT: ldrh r1, [r1] 852; CHECK-NEXT: vmov.16 q2[7], r1 853; CHECK-NEXT: vmov r1, s12 854; CHECK-NEXT: ldrh r1, [r1] 855; CHECK-NEXT: vmov.16 q0[0], r1 856; CHECK-NEXT: vmov r1, s13 857; CHECK-NEXT: ldrh r1, [r1] 858; CHECK-NEXT: vmov.16 q0[1], r1 859; CHECK-NEXT: vmov r1, s14 860; CHECK-NEXT: ldrh r1, [r1] 861; CHECK-NEXT: vmov.16 q0[2], r1 862; CHECK-NEXT: vmov r1, s15 863; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload 864; CHECK-NEXT: vadd.i16 q6, q6, q3 865; CHECK-NEXT: vadd.i16 q5, q5, q3 866; CHECK-NEXT: vadd.i16 q7, q7, q3 867; CHECK-NEXT: ldrh r1, [r1] 868; CHECK-NEXT: vmov.16 q0[3], r1 869; CHECK-NEXT: vmov r1, s16 870; CHECK-NEXT: ldrh r1, [r1] 871; 
CHECK-NEXT: vmov.16 q0[4], r1 872; CHECK-NEXT: vmov r1, s17 873; CHECK-NEXT: ldrh r1, [r1] 874; CHECK-NEXT: vmov.16 q0[5], r1 875; CHECK-NEXT: vmov r1, s19 876; CHECK-NEXT: vmov.16 q0[6], r7 877; CHECK-NEXT: ldrh r1, [r1] 878; CHECK-NEXT: vmov.16 q0[7], r1 879; CHECK-NEXT: vadd.i16 q0, q0, q2 880; CHECK-NEXT: vadd.i16 q0, q0, q1 881; CHECK-NEXT: vstrb.8 q0, [r4], #16 882; CHECK-NEXT: le lr, .LBB12_3 883; CHECK-NEXT: @ %bb.4: @ %middle.block 884; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 885; CHECK-NEXT: cmp r8, r2 886; CHECK-NEXT: bne.w .LBB12_2 887; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup 888; CHECK-NEXT: add sp, #104 889; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 890; CHECK-NEXT: add sp, #4 891; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 892; CHECK-NEXT: .p2align 4 893; CHECK-NEXT: @ %bb.6: 894; CHECK-NEXT: .LCPI12_0: 895; CHECK-NEXT: .short 1 @ 0x1 896; CHECK-NEXT: .short 4 @ 0x4 897; CHECK-NEXT: .short 7 @ 0x7 898; CHECK-NEXT: .short 10 @ 0xa 899; CHECK-NEXT: .short 13 @ 0xd 900; CHECK-NEXT: .short 16 @ 0x10 901; CHECK-NEXT: .short 19 @ 0x13 902; CHECK-NEXT: .short 22 @ 0x16 903; CHECK-NEXT: .LCPI12_1: 904; CHECK-NEXT: .short 0 @ 0x0 905; CHECK-NEXT: .short 3 @ 0x3 906; CHECK-NEXT: .short 6 @ 0x6 907; CHECK-NEXT: .short 9 @ 0x9 908; CHECK-NEXT: .short 12 @ 0xc 909; CHECK-NEXT: .short 15 @ 0xf 910; CHECK-NEXT: .short 18 @ 0x12 911; CHECK-NEXT: .short 21 @ 0x15 912; CHECK-NEXT: .LCPI12_2: 913; CHECK-NEXT: .short 2 @ 0x2 914; CHECK-NEXT: .short 5 @ 0x5 915; CHECK-NEXT: .short 8 @ 0x8 916; CHECK-NEXT: .short 11 @ 0xb 917; CHECK-NEXT: .short 14 @ 0xe 918; CHECK-NEXT: .short 17 @ 0x11 919; CHECK-NEXT: .short 20 @ 0x14 920; CHECK-NEXT: .short 23 @ 0x17 921 922 923entry: 924 %cmp22 = icmp sgt i32 %n, 0 925 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup 926 927vector.ph: ; preds = %for.body.preheader 928 %n.vec = and i32 %n, -8 929 br label %vector.body 930 931vector.body: ; preds = %vector.body, %vector.ph 932 %index = phi i32 
[ 0, %vector.ph ], [ %index.next, %vector.body ] 933 %vec.ind = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %vector.ph ], [ %vec.ind.next, %vector.body ] 934 %0 = mul nuw nsw <8 x i16> %vec.ind, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> 935 %1 = getelementptr inbounds i16, i16* %data, <8 x i16> %0 936 %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 937 %2 = add nuw nsw <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 938 %3 = getelementptr inbounds i16, i16* %data, <8 x i16> %2 939 %wide.masked.gather24 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %3, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 940 %4 = add nuw nsw <8 x i16> %0, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> 941 %5 = getelementptr inbounds i16, i16* %data, <8 x i16> %4 942 %wide.masked.gather25 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %5, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 943 %6 = add nsw <8 x i16> %wide.masked.gather24, %wide.masked.gather 944 %7 = add nsw <8 x i16> %6, %wide.masked.gather25 945 %8 = getelementptr inbounds i16, i16* %dst, i32 %index 946 %9 = bitcast i16* %8 to <8 x i16>* 947 store <8 x i16> %7, <8 x i16>* %9, align 2 948 %index.next = add i32 %index, 8 949 %vec.ind.next = add <8 x i16> %vec.ind, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 950 %10 = icmp eq i32 %index.next, %n.vec 951 br i1 %10, label %middle.block, label %vector.body 952 953middle.block: ; preds = %vector.body 954 %cmp.n = icmp eq i32 %n.vec, %n 955 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph 956 957for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 958 ret void 959} 960 961 962define 
arm_aapcs_vfpcc void @gather_inc_v16i8_complex(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, i32 %n) {
; NOTE(review): the CHECK lines below are autogenerated (utils/update_llc_test_checks.py,
; see the file header).  Regenerate them with that script rather than editing by hand.
; The three stride-3 v16i8 gathers are scalarised by the backend: each lane address is
; moved to a core register, loaded with ldrb, and reinserted with vmov.8, with the
; twelve v4i32 offset vectors kept alive via stack spills/reloads inside the loop.
; CHECK-LABEL: gather_inc_v16i8_complex:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #328
; CHECK-NEXT:    sub sp, #328
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    str r1, [sp, #120] @ 4-byte Spill
; CHECK-NEXT:    mov r1, r2
; CHECK-NEXT:    str r2, [sp, #124] @ 4-byte Spill
; CHECK-NEXT:    blt.w .LBB13_5
; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT:    ldr r1, [sp, #124] @ 4-byte Reload
; CHECK-NEXT:    adr.w r6, .LCPI13_8
; CHECK-NEXT:    adr.w r7, .LCPI13_7
; CHECK-NEXT:    adr.w r3, .LCPI13_6
; CHECK-NEXT:    bic r11, r1, #7
; CHECK-NEXT:    adr r1, .LCPI13_0
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    adr r1, .LCPI13_1
; CHECK-NEXT:    vmov.i32 q5, #0x30
; CHECK-NEXT:    str.w r11, [sp, #116] @ 4-byte Spill
; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    adr r1, .LCPI13_5
; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r6]
; CHECK-NEXT:    adr.w r6, .LCPI13_9
; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r7]
; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r6]
; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:  .LBB13_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB13_3 Depth 2
; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    adr r1, .LCPI13_3
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    adr r1, .LCPI13_4
; CHECK-NEXT:    vstrw.32 q2, [sp, #288] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q3, [r1]
; CHECK-NEXT:    adr r1, .LCPI13_2
; CHECK-NEXT:    vstrw.32 q2, [sp, #224] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    adr r1, .LCPI13_10
; CHECK-NEXT:    vstrw.32 q2, [sp, #272] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [sp, #304] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    adr r1, .LCPI13_11
; CHECK-NEXT:    ldr.w r9, [sp, #120] @ 4-byte Reload
; CHECK-NEXT:    vstrw.32 q2, [sp, #208] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q6, [r1]
; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q2, [sp, #192] @ 16-byte Spill
; CHECK-NEXT:  .LBB13_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB13_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vstrw.32 q3, [sp, #240] @ 16-byte Spill
; CHECK-NEXT:    vadd.i32 q3, q6, r0
; CHECK-NEXT:    vmov r1, s15
; CHECK-NEXT:    vstrw.32 q1, [sp, #256] @ 16-byte Spill
; CHECK-NEXT:    vadd.i32 q1, q0, r0
; CHECK-NEXT:    vstrw.32 q0, [sp, #176] @ 16-byte Spill
; CHECK-NEXT:    vadd.i32 q0, q7, r0
; CHECK-NEXT:    vstrw.32 q6, [sp, #160] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q6, [sp, #256] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q7, [sp, #144] @ 16-byte Spill
; CHECK-NEXT:    vmov r5, s7
; CHECK-NEXT:    vldrw.u32 q2, [sp, #240] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q6, q6, r0
; CHECK-NEXT:    vstrw.32 q4, [sp, #128] @ 16-byte Spill
; CHECK-NEXT:    subs.w r11, r11, #16
; CHECK-NEXT:    ldrb.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb.w lr, [r1]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldrb r6, [r1]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    ldrb.w r10, [r1]
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    ldrb r4, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb.w r8, [r1]
; CHECK-NEXT:    vmov r1, s24
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[0], r1
; CHECK-NEXT:    vmov r1, s25
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[1], r1
; CHECK-NEXT:    vmov r1, s26
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[2], r1
; CHECK-NEXT:    vmov r1, s27
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[3], r1
; CHECK-NEXT:    vmov r1, s12
; CHECK-NEXT:    vmov.8 q7[4], r6
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[0], r1
; CHECK-NEXT:    vmov r1, s13
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[1], r1
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    vadd.i32 q3, q4, r0
; CHECK-NEXT:    vldrw.u32 q4, [sp, #224] @ 16-byte Reload
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s15
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[2], r1
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    vmov.8 q6[3], r12
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    vldrw.u32 q1, [sp, #304] @ 16-byte Reload
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vstrw.32 q1, [sp, #304] @ 16-byte Spill
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[4], r1
; CHECK-NEXT:    vmov r1, s13
; CHECK-NEXT:    vmov.8 q6[5], lr
; CHECK-NEXT:    vmov.8 q6[6], r8
; CHECK-NEXT:    vmov.8 q6[7], r5
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vldrw.u32 q0, [sp, #288] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [sp, #288] @ 16-byte Spill
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r6, s0
; CHECK-NEXT:    ldrb r7, [r1]
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    vldrw.u32 q3, [sp, #208] @ 16-byte Reload
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q7[5], r5
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov.8 q7[6], r10
; CHECK-NEXT:    vmov.8 q7[7], r4
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    vmov.8 q7[8], r2
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    vmov.8 q7[9], r7
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[10], r1
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    vmov.8 q7[11], r3
; CHECK-NEXT:    vmov.8 q7[12], r6
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.8 q7[13], r5
; CHECK-NEXT:    vmov.8 q7[14], r4
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[8], r1
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[9], r1
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[10], r1
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    vadd.i32 q1, q2, r0
; CHECK-NEXT:    vldrw.u32 q2, [sp, #192] @ 16-byte Reload
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[11], r1
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[12], r1
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[13], r1
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[14], r1
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    vldrw.u32 q1, [sp, #256] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q1, q1, q5
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q6[15], r1
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vadd.i32 q0, q4, r0
; CHECK-NEXT:    vadd.i32 q4, q4, q5
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vstrw.32 q4, [sp, #224] @ 16-byte Spill
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[15], r1
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vadd.i8 q6, q7, q6
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[0], r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov.8 q7[1], r2
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[2], r1
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vldrw.u32 q0, [sp, #272] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [sp, #272] @ 16-byte Spill
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vldrw.u32 q4, [sp, #272] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q4, q4, q5
; CHECK-NEXT:    vstrw.32 q4, [sp, #272] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q4, [sp, #304] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q4, q4, q5
; CHECK-NEXT:    vstrw.32 q4, [sp, #304] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q4, [sp, #128] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q4, q4, q5
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[3], r1
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[4], r1
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[5], r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[6], r1
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vadd.i32 q0, q3, r0
; CHECK-NEXT:    vadd.i32 q3, q3, q5
; CHECK-NEXT:    vstrw.32 q3, [sp, #208] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [sp, #240] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q3, q3, q5
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[7], r1
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[8], r1
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[9], r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[10], r1
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vadd.i32 q0, q2, r0
; CHECK-NEXT:    vadd.i32 q2, q2, q5
; CHECK-NEXT:    vstrw.32 q2, [sp, #192] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q2, [sp, #288] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q2, q2, q5
; CHECK-NEXT:    vstrw.32 q2, [sp, #288] @ 16-byte Spill
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[11], r1
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[12], r1
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[13], r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[14], r1
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[15], r1
; CHECK-NEXT:    vadd.i8 q0, q6, q7
; CHECK-NEXT:    vldrw.u32 q7, [sp, #144] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q6, [sp, #160] @ 16-byte Reload
; CHECK-NEXT:    vstrb.8 q0, [r9], #16
; CHECK-NEXT:    vldrw.u32 q0, [sp, #176] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q7, q7, q5
; CHECK-NEXT:    vadd.i32 q6, q6, q5
; CHECK-NEXT:    vadd.i32 q0, q0, q5
; CHECK-NEXT:    bne.w .LBB13_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB13_2 Depth=1
; CHECK-NEXT:    ldr r1, [sp, #124] @ 4-byte Reload
; CHECK-NEXT:    ldr.w r11, [sp, #116] @ 4-byte Reload
; CHECK-NEXT:    cmp r11, r1
; CHECK-NEXT:    bne.w .LBB13_2
; CHECK-NEXT:  .LBB13_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #328
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI13_0:
; CHECK-NEXT:    .long 38 @ 0x26
; CHECK-NEXT:    .long 41 @ 0x29
; CHECK-NEXT:    .long 44 @ 0x2c
; CHECK-NEXT:    .long 47 @ 0x2f
; CHECK-NEXT:  .LCPI13_1:
; CHECK-NEXT:    .long 14 @ 0xe
; CHECK-NEXT:    .long 17 @ 0x11
; CHECK-NEXT:    .long 20 @ 0x14
; CHECK-NEXT:    .long 23 @ 0x17
; CHECK-NEXT:  .LCPI13_2:
; CHECK-NEXT:    .long 24 @ 0x18
; CHECK-NEXT:    .long 27 @ 0x1b
; CHECK-NEXT:    .long 30 @ 0x1e
; CHECK-NEXT:    .long 33 @ 0x21
; CHECK-NEXT:  .LCPI13_3:
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 4 @ 0x4
; CHECK-NEXT:    .long 7 @ 0x7
; CHECK-NEXT:    .long 10 @ 0xa
; CHECK-NEXT:  .LCPI13_4:
; CHECK-NEXT:    .long 36 @ 0x24
; CHECK-NEXT:    .long 39 @ 0x27
; CHECK-NEXT:    .long 42 @ 0x2a
; CHECK-NEXT:    .long 45 @ 0x2d
; CHECK-NEXT:  .LCPI13_5:
; CHECK-NEXT:    .long 25 @ 0x19
; CHECK-NEXT:    .long 28 @ 0x1c
; CHECK-NEXT:    .long 31 @ 0x1f
; CHECK-NEXT:    .long 34 @ 0x22
; CHECK-NEXT:  .LCPI13_6:
; CHECK-NEXT:    .long 13 @ 0xd
; CHECK-NEXT:    .long 16 @ 0x10
; CHECK-NEXT:    .long 19 @ 0x13
; CHECK-NEXT:    .long 22 @ 0x16
; CHECK-NEXT:  .LCPI13_7:
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:  .LCPI13_8:
; CHECK-NEXT:    .long 26 @ 0x1a
; CHECK-NEXT:    .long 29 @ 0x1d
; CHECK-NEXT:    .long 32 @ 0x20
; CHECK-NEXT:    .long 35 @ 0x23
; CHECK-NEXT:  .LCPI13_9:
; CHECK-NEXT:    .long 37 @ 0x25
; CHECK-NEXT:    .long 40 @ 0x28
; CHECK-NEXT:    .long 43 @ 0x2b
; CHECK-NEXT:    .long 46 @ 0x2e
; CHECK-NEXT:  .LCPI13_10:
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 15 @ 0xf
; CHECK-NEXT:    .long 18 @ 0x12
; CHECK-NEXT:    .long 21 @ 0x15
; CHECK-NEXT:  .LCPI13_11:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 3 @ 0x3
; CHECK-NEXT:    .long 6 @ 0x6
; CHECK-NEXT:    .long 9 @ 0x9


; IR: per iteration, for 16 lanes i, computes data[3i] + data[3i+1] + data[3i+2]
; via three v16i8 masked gathers (all-true mask) and stores the sums to dst.
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul nuw nsw <16 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i8, i8* %data, <16 x i32> %0
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %2 = add nuw nsw <16 x i32> %0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %3 = getelementptr inbounds i8, i8* %data, <16 x i32> %2
  %wide.masked.gather24 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %3, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %4 = add nuw nsw <16 x i32> %0, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %5 = getelementptr inbounds i8, i8* %data, <16 x i32> %4
  %wide.masked.gather25 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %5, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %6 = add nsw <16 x i8> %wide.masked.gather24, %wide.masked.gather
  %7 = add nsw <16 x i8> %6, %wide.masked.gather25
  %8 = getelementptr inbounds i8, i8* %dst, i32 %index
  %9 = bitcast i8* %8 to <16 x i8>*
  store <16 x i8> %7, <16 x i8>* %9, align 2
  %index.next = add i32 %index, 16
  %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
i32 16, i32 16, i32 16, i32 16>
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void
}

; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py (see the
; file header); regenerate rather than hand-editing.  Even a contiguous v16i8 gather
; (offsets = the induction vector itself) is scalarised here into per-lane ldrb loads
; and vmov.8 lane inserts, driven by four v4i32 offset vectors stepped by #0x10.
define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v16i8_simple:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #72
; CHECK-NEXT:    sub sp, #72
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    blt.w .LBB14_5
; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT:    adr r5, .LCPI14_3
; CHECK-NEXT:    adr r7, .LCPI14_1
; CHECK-NEXT:    vldrw.u32 q0, [r5]
; CHECK-NEXT:    adr r6, .LCPI14_2
; CHECK-NEXT:    adr r3, .LCPI14_0
; CHECK-NEXT:    bic r12, r2, #7
; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r7]
; CHECK-NEXT:    vmov.i32 q4, #0x10
; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r6]
; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:  .LBB14_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    mov lr, r1
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:  .LBB14_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB14_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vadd.i32 q1, q7, r0
; CHECK-NEXT:    vadd.i32 q2, q0, r0
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vadd.i32 q3, q5, r0
; CHECK-NEXT:    vmov r6, s12
; CHECK-NEXT:    subs r3, #16
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vadd.i32 q5, q5, q4
; CHECK-NEXT:    vadd.i32 q7, q7, q4
; CHECK-NEXT:    vadd.i32 q0, q0, q4
; CHECK-NEXT:    ldrb.w r8, [r4]
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb.w r10, [r4]
; CHECK-NEXT:    vmov r4, s9
; CHECK-NEXT:    ldrb.w r9, [r4]
; CHECK-NEXT:    vmov r4, s10
; CHECK-NEXT:    ldrb.w r11, [r4]
; CHECK-NEXT:    vmov r4, s11
; CHECK-NEXT:    vmov.8 q2[0], r6
; CHECK-NEXT:    vmov r6, s13
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    vmov.8 q2[1], r6
; CHECK-NEXT:    vmov r6, s14
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    vmov.8 q2[2], r6
; CHECK-NEXT:    vmov r6, s15
; CHECK-NEXT:    vadd.i32 q3, q6, r0
; CHECK-NEXT:    vadd.i32 q6, q6, q4
; CHECK-NEXT:    vmov r7, s12
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[3], r6
; CHECK-NEXT:    vmov r6, s5
; CHECK-NEXT:    vmov.8 q2[4], r7
; CHECK-NEXT:    vmov r7, s13
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[5], r7
; CHECK-NEXT:    vmov r7, s14
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[6], r7
; CHECK-NEXT:    vmov r7, s15
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[7], r7
; CHECK-NEXT:    vmov r7, s4
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[8], r7
; CHECK-NEXT:    vmov.8 q2[9], r6
; CHECK-NEXT:    vmov.8 q2[10], r8
; CHECK-NEXT:    vmov.8 q2[11], r10
; CHECK-NEXT:    vmov.8 q2[12], r5
; CHECK-NEXT:    vmov.8 q2[13], r9
; CHECK-NEXT:    vmov.8 q2[14], r11
; CHECK-NEXT:    vmov.8 q2[15], r4
; CHECK-NEXT:    vstrb.8 q2, [lr], #16
; CHECK-NEXT:    bne .LBB14_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB14_2 Depth=1
; CHECK-NEXT:    cmp r12, r2
; CHECK-NEXT:    bne .LBB14_2
; CHECK-NEXT:  .LBB14_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #72
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI14_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 3 @ 0x3
; CHECK-NEXT:  .LCPI14_1:
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 9 @ 0x9
; CHECK-NEXT:    .long 10 @ 0xa
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:  .LCPI14_2:
; CHECK-NEXT:    .long 4 @ 0x4
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 6 @ 0x6
; CHECK-NEXT:    .long 7 @ 0x7
; CHECK-NEXT:  .LCPI14_3:
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 13 @ 0xd
; CHECK-NEXT:    .long 14 @ 0xe
; CHECK-NEXT:    .long 15 @ 0xf


; IR: straight 16-byte-per-iteration copy expressed as a masked gather of
; data[index .. index+15] (all-true mask) stored contiguously to dst.
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %data, <16 x i32> %vec.ind
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %0, i32
2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %1 = getelementptr inbounds i8, i8* %dst, i32 %index
  %2 = bitcast i8* %1 to <16 x i8>*
  store <16 x i8> %wide.masked.gather, <16 x i8>* %2, align 2
  %index.next = add i32 %index, 16
  %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void
}


; Declarations of the llvm.masked.gather (and one masked.store) intrinsics used by
; the tests in this file, covering the i8/i16/i32/half/float element types and
; vector widths exercised above.
declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>)
declare <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>)
declare <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*>, i32, <16 x i1>, <16 x half>)
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)