; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; Tests for lowering @llvm.masked.gather of <8 x i16>/<8 x half> through a
; scalar base plus a vector of halfword offsets. The good cases fold into a
; single MVE "vldrh.u16 qD, [rN, qM, uxtw #1]" scaled gather; the remaining
; cases (sign-extended or out-of-range offsets, non-i16 base types) are
; scalarised. CHECK lines are autogenerated - regenerate with
; update_llc_test_checks.py rather than editing them by hand.

; Zero-extended i16 offsets, i16 element: fits the uxtw #1 gather form.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; Same gather but the i16 pointers are bitcast to half before the load.
define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_i16(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8f16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

; Direct half* base - no bitcast needed, still a single scaled gather.
define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_half(half* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8f16_half:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds half, half* %base, <8 x i32> %offs.zext
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

; Sign-extended offsets do not match the uxtw form, so this is scalarised.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vshl.i32 q1, q1, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrh.w lr, [r3]
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r12
; CHECK-NEXT:    vmov.16 q0[3], lr
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; Sign-extended offsets with a half result: scalarised via vldr.16 loads.
define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8f16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q1, q0, r0
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    vldr.16 s0, [r2]
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vldr.16 s0, [r3]
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    vmov.16 q0[1], r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vldr.16 s8, [r2]
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov.16 q0[2], r2
; CHECK-NEXT:    vmov r2, s7
; CHECK-NEXT:    vldr.16 s4, [r2]
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vmov.16 q0[3], r2
; CHECK-NEXT:    vshl.i32 q1, q1, #1
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vldr.16 s4, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q0[7], r0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

; Zero-extended i8 offsets: widened with vldrb.u16 and folded into the gather.
define arm_aapcs_vfpcc <8 x i16> @unsigned_scaled_v8i16_i8(i16* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: unsigned_scaled_v8i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

define arm_aapcs_vfpcc <8 x half> @unsigned_scaled_v8f16_i8(i16* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: unsigned_scaled_v8f16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

; All-true mask + zero passthru: passthru is dead, plain unpredicated gather.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru0t(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_passthru0t:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> zeroinitializer)
  ret <8 x i16> %gather
}

; All-true mask + splat-of-1 passthru: passthru still dead.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru1t(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_passthru1t:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %gather
}

; One false lane + nonzero passthru: predicated gather, then vpsel merges
; the passthru into the inactive lane.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru1f(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_passthru1f:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    movw r2, #65487
; CHECK-NEXT:    vmov.i16 q0, #0x1
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q2, [r0, q1, uxtw #1]
; CHECK-NEXT:    vpsel q0, q2, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %gather
}

; One false lane + zero passthru: predicated gather alone suffices, since
; inactive lanes of a predicated load are zeroed - no vpsel needed.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru0f(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_passthru0f:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    movw r2, #65523
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
  ret <8 x i16> %gather
}

; Mask produced by a compare: folds into a vpt block around the gather.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp0(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp0:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vpt.s16 gt, q1, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %mask = icmp sgt <8 x i16> %offs, zeroinitializer
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
  ret <8 x i16> %gather
}

; Compare mask + nonzero passthru: vpt block plus vpsel merge.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp1(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i16 q0, #0x1
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vpt.s16 gt, q1, zr
; CHECK-NEXT:    vldrht.u16 q2, [r0, q1, uxtw #1]
; CHECK-NEXT:    vpsel q0, q2, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
  %mask = icmp sgt <8 x i16> %offs, zeroinitializer
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %gather
}

; Two chained GEPs with a variable first offset: not folded, scalarised.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0x28
; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vshl.i32 q2, q2, #1
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrh.w lr, [r3]
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r12
; CHECK-NEXT:    vmov.16 q0[3], lr
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs
  %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; Two chained GEPs, both with constant offsets: folded into a constant-pool
; offset vector and a single gather.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI14_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI14_0:
; CHECK-NEXT:    .short 20 @ 0x14
; CHECK-NEXT:    .short 23 @ 0x17
; CHECK-NEXT:    .short 26 @ 0x1a
; CHECK-NEXT:    .short 29 @ 0x1d
; CHECK-NEXT:    .short 32 @ 0x20
; CHECK-NEXT:    .short 35 @ 0x23
; CHECK-NEXT:    .short 38 @ 0x26
; CHECK-NEXT:    .short 41 @ 0x29
entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; i32 constant offsets that still fit in 16 bits after folding: gather form.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) {
; CHECK-LABEL: scaled_v8i16_i16_biggep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI15_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI15_0:
; CHECK-NEXT:    .short 20 @ 0x14
; CHECK-NEXT:    .short 23 @ 0x17
; CHECK-NEXT:    .short 26 @ 0x1a
; CHECK-NEXT:    .short 29 @ 0x1d
; CHECK-NEXT:    .short 32 @ 0x20
; CHECK-NEXT:    .short 35 @ 0x23
; CHECK-NEXT:    .short 38 @ 0x26
; CHECK-NEXT:    .short 41 @ 0x29
entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep2(i16* %base) {
; CHECK-LABEL: scaled_v8i16_i16_biggep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI16_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI16_0:
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 3 @ 0x3
; CHECK-NEXT:    .short 6 @ 0x6
; CHECK-NEXT:    .short 9 @ 0x9
; CHECK-NEXT:    .short 12 @ 0xc
; CHECK-NEXT:    .short 15 @ 0xf
; CHECK-NEXT:    .short 18 @ 0x12
; CHECK-NEXT:    .short 21 @ 0x15
entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; Adding 65536 pushes every offset out of the 16-bit range: scalarised.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep3(i16* %base) {
; CHECK-LABEL: scaled_v8i16_i16_biggep3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    adr r1, .LCPI17_0
; CHECK-NEXT:    adr.w r12, .LCPI17_1
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r12]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w lr, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh r6, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], lr
; CHECK-NEXT:    vmov.16 q0[3], r6
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r6, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI17_0:
; CHECK-NEXT:    .long 131072 @ 0x20000
; CHECK-NEXT:    .long 131078 @ 0x20006
; CHECK-NEXT:    .long 131084 @ 0x2000c
; CHECK-NEXT:    .long 131090 @ 0x20012
; CHECK-NEXT:  .LCPI17_1:
; CHECK-NEXT:    .long 131096 @ 0x20018
; CHECK-NEXT:    .long 131102 @ 0x2001e
; CHECK-NEXT:    .long 131108 @ 0x20024
; CHECK-NEXT:    .long 131114 @ 0x2002a
entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 65536
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; A single out-of-range lane (65536) prevents the gather form: scalarised.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep4(i16* %base) {
; CHECK-LABEL: scaled_v8i16_i16_biggep4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    adr r1, .LCPI18_0
; CHECK-NEXT:    adr.w r12, .LCPI18_1
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r12]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w lr, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh r6, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], lr
; CHECK-NEXT:    vmov.16 q0[3], r6
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r6, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI18_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 6 @ 0x6
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 18 @ 0x12
; CHECK-NEXT:  .LCPI18_1:
; CHECK-NEXT:    .long 24 @ 0x18
; CHECK-NEXT:    .long 131072 @ 0x20000
; CHECK-NEXT:    .long 36 @ 0x24
; CHECK-NEXT:    .long 42 @ 0x2a
entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 65536, i32 18, i32 21>
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; Vector-of-pointers base (no scalar base register): scalarised.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep5(<8 x i16*> %base) {
; CHECK-LABEL: scaled_v8i16_i16_biggep5:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vmov.i32 q2, #0x20000
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    ldrh.w lr, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], r12
; CHECK-NEXT:    vmov.16 q0[3], lr
; CHECK-NEXT:    vmov.16 q0[4], r3
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov.16 q0[6], r1
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %base, i32 65536
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; One lane out of range before the +1 adjustment: scalarised.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep6(i16* %base) {
; CHECK-LABEL: scaled_v8i16_i16_biggep6:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    adr r1, .LCPI20_0
; CHECK-NEXT:    adr.w r12, .LCPI20_1
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r12]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w lr, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh r6, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], lr
; CHECK-NEXT:    vmov.16 q0[3], r6
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r6, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI20_0:
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 14 @ 0xe
; CHECK-NEXT:    .long 20 @ 0x14
; CHECK-NEXT:  .LCPI20_1:
; CHECK-NEXT:    .long 131074 @ 0x20002
; CHECK-NEXT:    .long 32 @ 0x20
; CHECK-NEXT:    .long 38 @ 0x26
; CHECK-NEXT:    .long 44 @ 0x2c
entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 65536, i32 15, i32 18, i32 21>
  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 1
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; i16 offsets that wrap when folded (65000 + 600): scalarised.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep7(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_biggep7:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    adr r1, .LCPI21_0
; CHECK-NEXT:    adr.w r12, .LCPI21_1
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r12]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w lr, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh r6, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], lr
; CHECK-NEXT:    vmov.16 q0[3], r6
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r6, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI21_0:
; CHECK-NEXT:    .long 128 @ 0x80
; CHECK-NEXT:    .long 1206 @ 0x4b6
; CHECK-NEXT:    .long 1212 @ 0x4bc
; CHECK-NEXT:    .long 1218 @ 0x4c2
; CHECK-NEXT:  .LCPI21_1:
; CHECK-NEXT:    .long 1224 @ 0x4c8
; CHECK-NEXT:    .long 1230 @ 0x4ce
; CHECK-NEXT:    .long 1236 @ 0x4d4
; CHECK-NEXT:    .long 1242 @ 0x4da
entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 65000, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 600
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; i32* base with i16 loads: the i32 (#2) scale does not match the i16
; element size, so no gather instruction applies - scalarised.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_basei32(i32* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_basei32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
; CHECK-NEXT:    vshl.i32 q0, q0, #2
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrh.w lr, [r3]
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r12
; CHECK-NEXT:    vmov.16 q0[3], lr
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %offs.zext
  %ptrs.cast = bitcast <8 x i32*> %ptrs to <8 x i16*>
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs.cast, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1
declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1