; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s


define arm_aapcs_vfpcc void @scatter_inc_minipred_4i32(<4 x i32> %data, i32* %dst, <4 x i32> %offs) {
; CHECK-LABEL: scatter_inc_minipred_4i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movw r1, #3855
; CHECK-NEXT:    vmov.i32 q2, #0x4
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
  %2 = getelementptr inbounds i32, i32* %dst, <4 x i32> %1
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data, <4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
  ret void
}

define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, i16* %dst, <8 x i32> %offs) {
; CHECK-LABEL: scatter_inc_mini_8i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vshl.i32 q1, q1, #1
; CHECK-NEXT:    vmov.i32 q3, #0x10
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov.u16 r2, q0[0]
; CHECK-NEXT:    vadd.i32 q4, q1, q3
; CHECK-NEXT:    vshl.i32 q1, q2, #1
; CHECK-NEXT:    vmov r1, s16
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vadd.i32 q1, q1, q3
; CHECK-NEXT:    strh r2, [r1]
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %2 = getelementptr inbounds i16, i16* %dst, <8 x i32> %1
  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %data, <8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, i8* %dst, <16 x i32> %offs) {
; CHECK-LABEL: scatter_inc_mini_16i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov.i32 q5, #0x10
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q4, q1, q5
; CHECK-NEXT:    vmov.u8 r2, q0[0]
; CHECK-NEXT:    vmov r1, s16
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, q5
; CHECK-NEXT:    vadd.i32 q2, q2, q5
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    add r1, sp, #32
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vadd.i32 q1, q1, q5
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = getelementptr inbounds i8, i8* %dst, <16 x i32> %1
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %data, <16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i32> %data2, <4 x i32> %data3, i32* %dst, i32 %n) {
; CHECK-LABEL: scatter_inc_v4i32_complex:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #24
; CHECK-NEXT:    sub sp, #24
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB3_5
; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT:    adr r4, .LCPI3_2
; CHECK-NEXT:    bic r2, r1, #3
; CHECK-NEXT:    vldrw.u32 q3, [r4]
; CHECK-NEXT:    sub.w r12, r2, #4
; CHECK-NEXT:    adr.w lr, .LCPI3_1
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [lr]
; CHECK-NEXT:    adr.w r12, .LCPI3_0
; CHECK-NEXT:    vadd.i32 q4, q3, r0
; CHECK-NEXT:    vldrw.u32 q3, [r12]
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:  .LBB3_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:    vmov q6, q4
; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov q5, q3
; CHECK-NEXT:  .LBB3_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vstrw.32 q0, [q5, #48]!
; CHECK-NEXT:    vstrw.32 q1, [q6, #48]!
; CHECK-NEXT:    vstrw.32 q2, [q7, #48]!
; CHECK-NEXT:    le lr, .LBB3_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    bne .LBB3_2
; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #24
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop {r4, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI3_0:
; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
; CHECK-NEXT:  .LCPI3_1:
; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
; CHECK-NEXT:  .LCPI3_2:
; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i32, i32* %dst, <4 x i32> %0
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data1, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  %3 = getelementptr inbounds i32, i32* %dst, <4 x i32> %2
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data2, <4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
  %5 = getelementptr inbounds i32, i32* %dst, <4 x i32> %4
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data3, <4 x i32*> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void
}


declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)