; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck %s

; Tests that masked gathers of 32-bit elements with 32-bit vector offsets
; (loaded as i8/i16/i32 and zero-/sign-extended) select the MVE scaled
; vector-gather forms (vldrw/vldrh/vldrb with [base, offsets, uxtw #n]).

define arm_aapcs_vfpcc <4 x i32> @zext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: zext_scaled_i16_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

define arm_aapcs_vfpcc <4 x i32> @sext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: sext_scaled_i16_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_i32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; TODO: scaled_f16_i32

define arm_aapcs_vfpcc <4 x float> @scaled_f32_i32(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_f32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: signed_scaled_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: a_unsigned_scaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: b_signed_scaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: signed_scaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: a_unsigned_scaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: b_signed_scaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_i32_i32_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.i32 q0, #0x14
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_i32_i32_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI21_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI21_0:
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:    .long 14 @ 0xe
entry:
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)