; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; Tests lowering of @llvm.masked.scatter with 16-bit elements and unscaled
; (byte) offsets on MVE: offsets that are zero-extended from a narrower type
; select the VSTRB/VSTRH [base, offs] scatter forms; sign-extended or
; already-wide offsets are expanded into per-lane scalar stores.

; VSTRB.16 Qd, [base, offs]
define arm_aapcs_vfpcc void @ext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: ext_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %t = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %t, <8 x i8*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRB.16 Qd, [base, offs]
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, offs]
define arm_aapcs_vfpcc void @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, offs]
define arm_aapcs_vfpcc void @unscaled_v8f16_i16(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand: sign-extended offsets do not match the scatter form, so the store
; is broken into per-lane strh instructions.
define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q2, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand: as above, but half-precision lanes are stored with vstr.16.
define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q2, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vmovx.f16 s12, s0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    vstr.16 s12, [r1]
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vmov r1, s11
; CHECK-NEXT:    vmovx.f16 s8, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vstr.16 s8, [r1]
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmovx.f16 s8, s2
; CHECK-NEXT:    vstr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand: offsets already i32 (no extend to fold), so per-lane stores again.
define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmovx.f16 s12, s0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    vstr.16 s12, [r1]
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vmov r1, s11
; CHECK-NEXT:    vmovx.f16 s8, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vstr.16 s8, [r1]
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmovx.f16 s8, s2
; CHECK-NEXT:    vstr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, zext(offs)]
define arm_aapcs_vfpcc void @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, zext(offs)]
define arm_aapcs_vfpcc void @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unsigned_unscaled_f16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrb.s32 q5, [r1]
; CHECK-NEXT:    vldrb.s32 q4, [r1, #4]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vadd.i32 q5, q5, r0
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s21
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s22
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s23
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov r1, s12
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.16 q4[0], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.16 q4[1], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.16 q4[2], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov.16 q4[3], r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov.16 q4[4], r3
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov.16 q4[5], r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.16 q4[6], r3
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vmov.16 q4[7], r2
; CHECK-NEXT:    vstrh.16 q4, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
; i32 source truncated to i16 with sext i8 offsets: the CHECK lines show the
; scatter is expanded into per-lane strh stores (no VSTRH scatter form used).
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q3, [r1]
; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
; i32 source truncated to i16 with zext i8 offsets: lanes are narrowed one at
; a time through vmov.16, then a single VSTRH.16 [base, offs] scatter is used.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.16 q2[0], r3
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmov.16 q2[1], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.16 q2[2], r3
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov.16 q2[3], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.16 q2[4], r3
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    vmov.16 q2[5], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov r2, s7
; CHECK-NEXT:    vmov.16 q2[6], r3
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vmov.16 q2[7], r2
; CHECK-NEXT:    vstrh.16 q2, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
; i16 source truncated to i8 with sext i8 offsets: expanded into per-lane
; strb stores (CHECK lines show the scalarized sequence).
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x half*>, i32, <8 x i1>)