; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; VLDRB.8
define arm_aapcs_vfpcc void @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vstrb.8 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
; CHECK-LABEL: unscaled_v8i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q2, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %input, <8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x i8> %input) {
; CHECK-LABEL: unscaled_v2i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r2, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    ldrb r1, [r1, #1]
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov.32 q2[2], r1
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strb r2, [r0, r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    strb r2, [r0, r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i8>, <2 x i8>* %offptr, align 1
  %offs.zext = zext <2 x i8> %offs to <2 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext
  call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %input, <2 x i8*> %ptrs, i32 1, <2 x i1> <i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.s32 q4, [r1]
; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.s32 q3, [r1, #4]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.sext = sext <16 x i8> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrh.s32 q4, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
; CHECK-NEXT:    vldrh.s32 q2, [r1, #16]
; CHECK-NEXT:    vldrh.s32 q3, [r1, #8]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i16>, <16 x i16>* %offptr, align 2
  %offs.sext = sext <16 x i16> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_scaled:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.u32 q4, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
; CHECK-NEXT:    vldrb.u32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.u32 q3, [r1, #4]
; CHECK-NEXT:    vshl.i32 q4, q4, #2
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vshl.i32 q2, q2, #2
; CHECK-NEXT:    vshl.i32 q3, q3, #2
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 4
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext
  %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_next:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q4, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r1, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r1, #16]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i32>, <16 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    add r3, sp, #40
; CHECK-NEXT:    vmov.8 q5[0], r4
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    vmov.8 q5[1], r4
; CHECK-NEXT:    vmov r4, s4
; CHECK-NEXT:    vmov.8 q5[2], r4
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vmov.8 q5[3], r4
; CHECK-NEXT:    vmov r4, s8
; CHECK-NEXT:    vmov.8 q5[4], r4
; CHECK-NEXT:    vmov r4, s10
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vmov.8 q5[5], r4
; CHECK-NEXT:    vmov r4, s12
; CHECK-NEXT:    add.w lr, sp, #56
; CHECK-NEXT:    vmov.8 q5[6], r4
; CHECK-NEXT:    vmov r4, s14
; CHECK-NEXT:    vmov.8 q5[7], r4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q5[8], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vldrw.u32 q0, [lr]
; CHECK-NEXT:    vmov.8 q5[9], r3
; CHECK-NEXT:    add.w r12, sp, #72
; CHECK-NEXT:    add r2, sp, #88
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vldrw.u32 q4, [r2]
; CHECK-NEXT:    vmov.8 q5[10], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vldrw.u32 q0, [r12]
; CHECK-NEXT:    vmov.8 q5[11], r3
; CHECK-NEXT:    vmov r2, s18
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q5[12], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.8 q5[13], r3
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    vmov.8 q5[14], r3
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vmov.8 q5[15], r2
; CHECK-NEXT:    vstrb.8 q5, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i64> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q4[0], r3
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmov.8 q4[1], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.8 q4[2], r3
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov.8 q4[3], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.8 q4[4], r3
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    vmov.8 q4[5], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov.8 q4[6], r3
; CHECK-NEXT:    vmov r3, s7
; CHECK-NEXT:    vmov.8 q4[7], r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov.8 q4[8], r3
; CHECK-NEXT:    vmov r3, s9
; CHECK-NEXT:    vmov.8 q4[9], r3
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov.8 q4[10], r3
; CHECK-NEXT:    vmov r3, s11
; CHECK-NEXT:    vmov.8 q4[11], r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    vmov.8 q4[12], r3
; CHECK-NEXT:    vmov r3, s13
; CHECK-NEXT:    vmov.8 q4[13], r3
; CHECK-NEXT:    vmov r3, s14
; CHECK-NEXT:    vmov r2, s15
; CHECK-NEXT:    vmov.8 q4[14], r3
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vmov.8 q4[15], r2
; CHECK-NEXT:    vstrb.8 q4, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i32> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    vmov.u16 r2, q1[7]
; CHECK-NEXT:    vmov.8 q2[0], r3
; CHECK-NEXT:    vmov.u16 r3, q0[1]
; CHECK-NEXT:    vmov.8 q2[1], r3
; CHECK-NEXT:    vmov.u16 r3, q0[2]
; CHECK-NEXT:    vmov.8 q2[2], r3
; CHECK-NEXT:    vmov.u16 r3, q0[3]
; CHECK-NEXT:    vmov.8 q2[3], r3
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    vmov.8 q2[4], r3
; CHECK-NEXT:    vmov.u16 r3, q0[5]
; CHECK-NEXT:    vmov.8 q2[5], r3
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    vmov.8 q2[6], r3
; CHECK-NEXT:    vmov.u16 r3, q0[7]
; CHECK-NEXT:    vmov.8 q2[7], r3
; CHECK-NEXT:    vmov.u16 r3, q1[0]
; CHECK-NEXT:    vmov.8 q2[8], r3
; CHECK-NEXT:    vmov.u16 r3, q1[1]
; CHECK-NEXT:    vmov.8 q2[9], r3
; CHECK-NEXT:    vmov.u16 r3, q1[2]
; CHECK-NEXT:    vmov.8 q2[10], r3
; CHECK-NEXT:    vmov.u16 r3, q1[3]
; CHECK-NEXT:    vmov.8 q2[11], r3
; CHECK-NEXT:    vmov.u16 r3, q1[4]
; CHECK-NEXT:    vmov.8 q2[12], r3
; CHECK-NEXT:    vmov.u16 r3, q1[5]
; CHECK-NEXT:    vmov.8 q2[13], r3
; CHECK-NEXT:    vmov.u16 r3, q1[6]
; CHECK-NEXT:    vmov.8 q2[14], r3
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vmov.8 q2[15], r2
; CHECK-NEXT:    vstrb.8 q2, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i16> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.s32 q3, [r1, #4]
; CHECK-NEXT:    vldrb.s32 q5, [r1]
; CHECK-NEXT:    vmov.i32 q4, #0x5
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vadd.i32 q5, q5, r0
; CHECK-NEXT:    vadd.i32 q1, q1, q4
; CHECK-NEXT:    vadd.i32 q2, q2, q4
; CHECK-NEXT:    vadd.i32 q3, q3, q4
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
  %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI11_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrb.8 q0, [r0, q1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI11_0:
; CHECK-NEXT:    .byte 5 @ 0x5
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 11 @ 0xb
; CHECK-NEXT:    .byte 14 @ 0xe
; CHECK-NEXT:    .byte 17 @ 0x11
; CHECK-NEXT:    .byte 20 @ 0x14
; CHECK-NEXT:    .byte 23 @ 0x17
; CHECK-NEXT:    .byte 26 @ 0x1a
; CHECK-NEXT:    .byte 29 @ 0x1d
; CHECK-NEXT:    .byte 32 @ 0x20
; CHECK-NEXT:    .byte 35 @ 0x23
; CHECK-NEXT:    .byte 38 @ 0x26
; CHECK-NEXT:    .byte 41 @ 0x29
; CHECK-NEXT:    .byte 44 @ 0x2c
; CHECK-NEXT:    .byte 47 @ 0x2f
; CHECK-NEXT:    .byte 50 @ 0x32
entry:
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45>
  %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}


declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)