; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck %s

; VSTRH.32 Qd, [base, offs, uxtw #1]
define arm_aapcs_vfpcc void @ext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_scaled_i16_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs, uxtw #2]
define arm_aapcs_vfpcc void @scaled_i32_i32(i32* %base, <4 x i32>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: scaled_i32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs, uxtw #2]
define arm_aapcs_vfpcc void @scaled_f32_i32(i32* %base, <4 x i32>* %offptr, <4 x float> %input) {
; CHECK-LABEL: scaled_f32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
define arm_aapcs_vfpcc void @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: unsigned_scaled_b_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
define arm_aapcs_vfpcc void @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: signed_scaled_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
define arm_aapcs_vfpcc void @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr, <4 x float> %input) {
; CHECK-LABEL: a_unsigned_scaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
define arm_aapcs_vfpcc void @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr, <4 x float> %input) {
; CHECK-LABEL: b_signed_scaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.32 Qd, [base, offs.sext, uxtw #1]
define arm_aapcs_vfpcc void @ext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_signed_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.32 Qd, [base, offs.zext, uxtw #1]
define arm_aapcs_vfpcc void @ext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_unsigned_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
define arm_aapcs_vfpcc void @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: unsigned_scaled_b_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
define arm_aapcs_vfpcc void @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: signed_scaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
define arm_aapcs_vfpcc void @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr, <4 x float> %input) {
; CHECK-LABEL: a_unsigned_scaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
define arm_aapcs_vfpcc void @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr, <4 x float> %input) {
; CHECK-LABEL: b_signed_scaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.32 Qd, [base, offs.sext, uxtw #1]
define arm_aapcs_vfpcc void @ext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_signed_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.32 Qd, [base, offs.zext, uxtw #1]
define arm_aapcs_vfpcc void @ext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_unsigned_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_scaled_i16_i32_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0xa
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vshl.i32 q2, q2, #1
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
  %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep2(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_scaled_i16_i32_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI16_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI16_0:
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:    .long 14 @ 0xe
entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16> <i16 0, i16 3, i16 6, i16 9>
  %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)