1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s 3 4; i32 5 6define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) { 7; CHECK-LABEL: vst3_v2i32: 8; CHECK: @ %bb.0: @ %entry 9; CHECK-NEXT: .save {r4, lr} 10; CHECK-NEXT: push {r4, lr} 11; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} 12; CHECK-NEXT: ldrd r4, r0, [r0, #16] 13; CHECK-NEXT: vmov.32 q1[0], r2 14; CHECK-NEXT: vmov.32 q1[1], r3 15; CHECK-NEXT: vmov.32 q0[0], r4 16; CHECK-NEXT: vmov.32 q1[2], r12 17; CHECK-NEXT: vmov.32 q0[1], r0 18; CHECK-NEXT: vmov.32 q1[3], lr 19; CHECK-NEXT: vmov.f32 s8, s7 20; CHECK-NEXT: vmov.f32 s10, s1 21; CHECK-NEXT: vmov r2, s8 22; CHECK-NEXT: vmov r0, s10 23; CHECK-NEXT: vmov.f64 d4, d2 24; CHECK-NEXT: vmov.f32 s9, s6 25; CHECK-NEXT: vmov.f32 s10, s0 26; CHECK-NEXT: vmov.f32 s11, s5 27; CHECK-NEXT: vstrw.32 q2, [r1] 28; CHECK-NEXT: strd r2, r0, [r1, #16] 29; CHECK-NEXT: pop {r4, pc} 30entry: 31 %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0 32 %l1 = load <2 x i32>, <2 x i32>* %s1, align 4 33 %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1 34 %l2 = load <2 x i32>, <2 x i32>* %s2, align 4 35 %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2 36 %l3 = load <2 x i32>, <2 x i32>* %s3, align 4 37 %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 38 %t2 = shufflevector <2 x i32> %l3, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 39 %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 40 store <6 x i32> %s, <6 x i32> *%dst 41 ret void 42} 43 44define void @vst3_v4i32(<4 x i32> *%src, <12 x i32> *%dst) { 45; CHECK-LABEL: vst3_v4i32: 46; CHECK: @ %bb.0: @ %entry 47; CHECK-NEXT: .vsave {d8, d9, d10, d11} 48; CHECK-NEXT: vpush {d8, d9, d10, d11} 49; CHECK-NEXT: vldrw.u32 q4, [r0] 50; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 51; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 52; CHECK-NEXT: vmov.f32 s4, s9 53; CHECK-NEXT: vmov.f64 d6, d8 54; CHECK-NEXT: vmov r0, s0 55; CHECK-NEXT: vmov.f32 s5, s1 56; CHECK-NEXT: vdup.32 q5, r0 57; CHECK-NEXT: vmov.f32 s13, s8 58; CHECK-NEXT: vmov.f32 s0, s2 59; CHECK-NEXT: vmov r0, s11 60; CHECK-NEXT: vmov.f32 s7, s10 61; CHECK-NEXT: vdup.32 q2, r0 62; CHECK-NEXT: vmov.f32 s15, s17 63; CHECK-NEXT: vmov.f32 s1, s19 64; CHECK-NEXT: vmov.f32 s6, s18 65; CHECK-NEXT: vmov.f32 s14, s22 66; CHECK-NEXT: vstrw.32 q1, [r1, #16] 67; CHECK-NEXT: vmov.f32 s2, s10 68; CHECK-NEXT: vstrw.32 q3, [r1] 69; CHECK-NEXT: vstrw.32 q0, [r1, #32] 70; CHECK-NEXT: vpop {d8, d9, d10, d11} 71; CHECK-NEXT: bx lr 72entry: 73 %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0 74 %l1 = load <4 x i32>, <4 x i32>* %s1, align 4 75 %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1 76 %l2 = load <4 x i32>, <4 x i32>* %s2, align 4 77 %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2 78 %l3 = load <4 x i32>, <4 x i32>* %s3, align 4 79 %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 80 %t2 = shufflevector <4 x i32> %l3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 81 %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 82 store <12 x i32> %s, <12 x i32> *%dst 83 ret void 84} 85 86define void @vst3_v8i32(<8 x i32> *%src, <24 x i32> *%dst) { 87; 
CHECK-LABEL: vst3_v8i32: 88; CHECK: @ %bb.0: @ %entry 89; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 90; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 91; CHECK-NEXT: .pad #48 92; CHECK-NEXT: sub sp, #48 93; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 94; CHECK-NEXT: vldrw.u32 q5, [r0, #16] 95; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 96; CHECK-NEXT: vldrw.u32 q4, [r0] 97; CHECK-NEXT: vmov.f64 d6, d1 98; CHECK-NEXT: vldrw.u32 q6, [r0, #32] 99; CHECK-NEXT: vldrw.u32 q7, [r0, #64] 100; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill 101; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill 102; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill 103; CHECK-NEXT: vmov.f32 s13, s23 104; CHECK-NEXT: vmov r2, s7 105; CHECK-NEXT: vmov.f32 s15, s3 106; CHECK-NEXT: vdup.32 q2, r2 107; CHECK-NEXT: vmov.f32 s14, s10 108; CHECK-NEXT: vmov.f64 d4, d8 109; CHECK-NEXT: vstrw.32 q3, [r1, #80] 110; CHECK-NEXT: vmov.f32 s9, s24 111; CHECK-NEXT: vmov.f32 s11, s17 112; CHECK-NEXT: vmov q4, q5 113; CHECK-NEXT: vmov.f32 s21, s4 114; CHECK-NEXT: vmov r0, s28 115; CHECK-NEXT: vmov.f32 s23, s17 116; CHECK-NEXT: vdup.32 q4, r0 117; CHECK-NEXT: vmov r0, s0 118; CHECK-NEXT: vmov.f32 s0, s5 119; CHECK-NEXT: vdup.32 q6, r0 120; CHECK-NEXT: vmov.f32 s10, s18 121; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload 122; CHECK-NEXT: vmov.f32 s3, s6 123; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload 124; CHECK-NEXT: vmov.f32 s22, s26 125; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload 126; CHECK-NEXT: vmov.f32 s2, s6 127; CHECK-NEXT: vstrw.32 q5, [r1, #48] 128; CHECK-NEXT: vmov.f32 s4, s17 129; CHECK-NEXT: vstrw.32 q0, [r1, #64] 130; CHECK-NEXT: vmov.f32 s5, s29 131; CHECK-NEXT: vstrw.32 q2, [r1] 132; CHECK-NEXT: vmov.f32 s28, s30 133; CHECK-NEXT: vmov r0, s19 134; CHECK-NEXT: vmov.f32 s7, s18 135; CHECK-NEXT: vdup.32 q4, r0 136; CHECK-NEXT: vmov.f32 s29, s27 137; CHECK-NEXT: vmov.f32 s6, s26 138; CHECK-NEXT: vmov.f32 s30, s18 139; CHECK-NEXT: vstrw.32 q1, [r1, #16] 140; CHECK-NEXT: vstrw.32 q7, [r1, #32] 141; CHECK-NEXT: add sp, #48 142; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 143; CHECK-NEXT: bx lr 144entry: 145 %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0 146 %l1 = load <8 x i32>, <8 x i32>* %s1, align 4 147 %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1 148 %l2 = load <8 x i32>, <8 x i32>* %s2, align 4 149 %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2 150 %l3 = load <8 x i32>, <8 x i32>* %s3, align 4 151 %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 152 %t2 = shufflevector <8 x i32> %l3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 153 %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 154 store <24 x i32> %s, <24 x i32> *%dst 155 ret void 156} 157 158define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) { 159; CHECK-LABEL: vst3_v16i32: 160; CHECK: @ %bb.0: @ %entry 161; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 162; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 163; CHECK-NEXT: .pad #160 164; CHECK-NEXT: sub sp, #160 165; CHECK-NEXT: vldrw.u32 q1, [r0, #144] 166; CHECK-NEXT: 
vldrw.u32 q0, [r0, #64] 167; CHECK-NEXT: vldrw.u32 q3, [r0, #128] 168; CHECK-NEXT: vldrw.u32 q5, [r0] 169; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill 170; CHECK-NEXT: vldrw.u32 q1, [r0, #80] 171; CHECK-NEXT: vmov.f32 s8, s1 172; CHECK-NEXT: vldrw.u32 q4, [r0, #176] 173; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill 174; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 175; CHECK-NEXT: vmov.f32 s9, s13 176; CHECK-NEXT: vldrw.u32 q6, [r0, #112] 177; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill 178; CHECK-NEXT: vldrw.u32 q1, [r0, #160] 179; CHECK-NEXT: vmov.f32 s11, s2 180; CHECK-NEXT: vldrw.u32 q7, [r0, #48] 181; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill 182; CHECK-NEXT: vldrw.u32 q1, [r0, #96] 183; CHECK-NEXT: vmov.f32 s10, s22 184; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill 185; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 186; CHECK-NEXT: vmov r0, s3 187; CHECK-NEXT: vstrw.32 q2, [r1, #16] 188; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill 189; CHECK-NEXT: vmov.f64 d2, d7 190; CHECK-NEXT: vdup.32 q2, r0 191; CHECK-NEXT: vmov.f32 s5, s23 192; CHECK-NEXT: vmov.f32 s7, s15 193; CHECK-NEXT: vmov.f32 s6, s10 194; CHECK-NEXT: vstrw.32 q1, [r1, #32] 195; CHECK-NEXT: vmov.f64 d2, d9 196; CHECK-NEXT: vmov.f32 s5, s31 197; CHECK-NEXT: vmov.f32 s7, s19 198; CHECK-NEXT: vmov r0, s27 199; CHECK-NEXT: vmov q2, q1 200; CHECK-NEXT: vdup.32 q1, r0 201; CHECK-NEXT: vmov r0, s12 202; CHECK-NEXT: vmov.f32 s10, s6 203; CHECK-NEXT: vmov.f64 d2, d10 204; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill 205; CHECK-NEXT: vmov.f32 s5, s0 206; CHECK-NEXT: vdup.32 q0, r0 207; CHECK-NEXT: vmov.f32 s7, s21 208; CHECK-NEXT: vmov.f32 s6, s2 209; CHECK-NEXT: vmov.f64 d0, d14 210; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill 211; CHECK-NEXT: vmov.f32 s1, s24 212; CHECK-NEXT: vmov.f32 s3, s29 213; CHECK-NEXT: vmov r0, s16 214; CHECK-NEXT: vmov q1, q0 215; CHECK-NEXT: vdup.32 q0, r0 216; CHECK-NEXT: vmov.f32 s16, s25 217; CHECK-NEXT: vmov.f32 s6, s2 218; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload 219; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill 220; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload 221; CHECK-NEXT: vmov.f32 s19, s26 222; CHECK-NEXT: vldrw.u32 q6, [sp, #144] @ 16-byte Reload 223; CHECK-NEXT: vmov.f32 s18, s30 224; CHECK-NEXT: vmov q2, q1 225; CHECK-NEXT: vmov.f32 s28, s5 226; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill 227; CHECK-NEXT: vmov.f32 s29, s1 228; CHECK-NEXT: vmov.f32 s31, s6 229; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload 230; CHECK-NEXT: vmov.f64 d8, d1 231; CHECK-NEXT: vmov q5, q1 232; CHECK-NEXT: vmov r0, s11 233; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload 234; CHECK-NEXT: vmov.f32 s17, s7 235; CHECK-NEXT: vmov.f32 s30, s6 236; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload 237; CHECK-NEXT: vmov.f64 d6, d5 238; CHECK-NEXT: vstrw.32 q7, [r1, #112] 239; CHECK-NEXT: vmov.f32 s13, s7 240; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload 241; CHECK-NEXT: vmov.f32 s19, s3 242; CHECK-NEXT: vdup.32 q0, r0 243; CHECK-NEXT: vmov r0, s27 244; CHECK-NEXT: vmov.f32 s18, s2 245; CHECK-NEXT: vdup.32 q0, r0 246; CHECK-NEXT: vmov.f32 s15, s11 247; CHECK-NEXT: vstrw.32 q4, [r1, #128] 248; CHECK-NEXT: vmov.f32 s14, s2 249; CHECK-NEXT: vmov q0, q5 250; CHECK-NEXT: vmov.f32 s21, s4 251; CHECK-NEXT: vstrw.32 q3, [r1, #80] 252; CHECK-NEXT: vmov.f32 s23, s1 253; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload 254; CHECK-NEXT: vmov r0, s0 255; CHECK-NEXT: vdup.32 q0, r0 256; CHECK-NEXT: vmov r0, s8 257; CHECK-NEXT: vmov.f32 s22, s2 
258; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload 259; CHECK-NEXT: vstrw.32 q5, [r1, #96] 260; CHECK-NEXT: vmov.f64 d2, d0 261; CHECK-NEXT: vmov.f32 s5, s24 262; CHECK-NEXT: vmov q6, q0 263; CHECK-NEXT: vmov.f32 s7, s1 264; CHECK-NEXT: vdup.32 q0, r0 265; CHECK-NEXT: vmov.f32 s6, s2 266; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload 267; CHECK-NEXT: vstrw.32 q1, [r1, #48] 268; CHECK-NEXT: vmov.f32 s8, s1 269; CHECK-NEXT: vmov.f32 s11, s2 270; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload 271; CHECK-NEXT: vmov.f32 s10, s26 272; CHECK-NEXT: vstrw.32 q0, [r1, #144] 273; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload 274; CHECK-NEXT: vstrw.32 q2, [r1, #64] 275; CHECK-NEXT: vstrw.32 q0, [r1, #160] 276; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload 277; CHECK-NEXT: vstrw.32 q0, [r1, #176] 278; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload 279; CHECK-NEXT: vstrw.32 q0, [r1] 280; CHECK-NEXT: add sp, #160 281; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 282; CHECK-NEXT: bx lr 283entry: 284 %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0 285 %l1 = load <16 x i32>, <16 x i32>* %s1, align 4 286 %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1 287 %l2 = load <16 x i32>, <16 x i32>* %s2, align 4 288 %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2 289 %l3 = load <16 x i32>, <16 x i32>* %s3, align 4 290 %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 291 %t2 = shufflevector <16 x i32> %l3, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 292 %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 293 store <48 x i32> %s, <48 x i32> *%dst 294 ret void 295} 296 297; i16 298 299define void @vst3_v2i16(<2 x i16> *%src, <6 x i16> *%dst) { 300; CHECK-LABEL: vst3_v2i16: 301; CHECK: @ %bb.0: @ %entry 302; CHECK-NEXT: .save {r4, lr} 303; CHECK-NEXT: push {r4, lr} 304; CHECK-NEXT: ldrh r2, [r0, #6] 305; CHECK-NEXT: ldrh r3, [r0, #4] 306; CHECK-NEXT: ldrh.w r12, [r0, #8] 307; CHECK-NEXT: vmov.16 q0[4], r2 308; CHECK-NEXT: ldrh.w lr, [r0, #2] 309; CHECK-NEXT: vmov.32 q1[0], r3 310; CHECK-NEXT: ldrh r4, [r0] 311; CHECK-NEXT: vmov.32 q1[2], r2 312; CHECK-NEXT: ldrh r0, [r0, #10] 313; CHECK-NEXT: vmov.16 q0[5], r0 314; CHECK-NEXT: vmov r0, s2 315; CHECK-NEXT: vmov.32 q0[0], r4 316; CHECK-NEXT: vmov.32 q0[2], lr 317; CHECK-NEXT: vmov.f32 s1, s4 318; CHECK-NEXT: vdup.32 q1, r12 319; CHECK-NEXT: vmov.f32 s3, s2 320; CHECK-NEXT: vmov.f32 s2, s6 321; CHECK-NEXT: vstrh.32 q0, [r1] 322; CHECK-NEXT: str r0, [r1, #8] 323; CHECK-NEXT: pop {r4, pc} 324entry: 325 %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0 326 %l1 = load <2 x i16>, <2 x i16>* 
%s1, align 4 327 %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1 328 %l2 = load <2 x i16>, <2 x i16>* %s2, align 4 329 %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2 330 %l3 = load <2 x i16>, <2 x i16>* %s3, align 4 331 %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 332 %t2 = shufflevector <2 x i16> %l3, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 333 %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 334 store <6 x i16> %s, <6 x i16> *%dst 335 ret void 336} 337 338define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) { 339; CHECK-LABEL: vst3_v4i16: 340; CHECK: @ %bb.0: @ %entry 341; CHECK-NEXT: .vsave {d8, d9} 342; CHECK-NEXT: vpush {d8, d9} 343; CHECK-NEXT: vldrh.u32 q2, [r0, #16] 344; CHECK-NEXT: vldrh.u32 q1, [r0] 345; CHECK-NEXT: vldrh.u32 q3, [r0, #8] 346; CHECK-NEXT: vmov.f64 d0, d5 347; CHECK-NEXT: vmov.f32 s1, s7 348; CHECK-NEXT: vmov r0, s15 349; CHECK-NEXT: vdup.32 q4, r0 350; CHECK-NEXT: vmov.f32 s3, s11 351; CHECK-NEXT: vmov r0, s4 352; CHECK-NEXT: vmov.f32 s2, s18 353; CHECK-NEXT: vmov.16 q4[0], r0 354; CHECK-NEXT: vmov r0, s12 355; CHECK-NEXT: vstrh.32 q0, [r1, #16] 356; CHECK-NEXT: vmov.16 q4[1], r0 357; CHECK-NEXT: vmov r0, s8 358; CHECK-NEXT: vmov.16 q4[2], r0 359; CHECK-NEXT: vmov r0, s5 360; CHECK-NEXT: vmov.16 q4[3], r0 361; CHECK-NEXT: vmov r0, s13 362; CHECK-NEXT: vmov.16 q4[4], r0 363; CHECK-NEXT: vmov r0, s9 364; CHECK-NEXT: vmov.16 q4[5], r0 365; CHECK-NEXT: vmov r0, s6 366; CHECK-NEXT: vmov.16 q4[6], r0 367; CHECK-NEXT: vmov r0, s14 368; CHECK-NEXT: vmov.16 q4[7], r0 369; CHECK-NEXT: vstrw.32 q4, [r1] 370; CHECK-NEXT: vpop {d8, d9} 371; CHECK-NEXT: bx lr 372entry: 373 %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0 374 %l1 = load <4 x i16>, <4 x i16>* %s1, align 4 375 %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1 376 %l2 = load <4 x i16>, <4 x i16>* %s2, align 4 377 %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2 378 %l3 = load <4 x i16>, <4 x i16>* %s3, align 4 379 %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 380 %t2 = shufflevector <4 x i16> %l3, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 381 %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 382 store <12 x i16> %s, <12 x i16> *%dst 383 ret void 384} 385 386define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) { 387; CHECK-LABEL: vst3_v8i16: 388; CHECK: @ %bb.0: @ %entry 389; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 390; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 391; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 392; CHECK-NEXT: vldrw.u32 q2, [r0] 393; CHECK-NEXT: vldrw.u32 q4, [r0, #16] 394; CHECK-NEXT: vmov.u16 r2, q1[2] 395; CHECK-NEXT: vmov.16 q0[0], r2 396; CHECK-NEXT: vmov.u16 r2, q2[3] 397; CHECK-NEXT: vmov.16 q0[1], r2 398; CHECK-NEXT: vmov.u16 r2, q1[4] 399; CHECK-NEXT: vmov.16 q0[6], r2 400; CHECK-NEXT: vmov.u16 r2, q2[5] 401; CHECK-NEXT: vmov.16 q0[7], r2 402; CHECK-NEXT: vmov.u16 r2, q2[0] 403; CHECK-NEXT: vmov.16 q3[0], r2 404; CHECK-NEXT: vmov.u16 r0, q4[0] 405; CHECK-NEXT: vmov.16 q3[1], r0 406; CHECK-NEXT: vmov.u16 r0, q4[1] 407; CHECK-NEXT: vmov.16 q3[4], r0 408; CHECK-NEXT: vmov.u16 r0, q2[2] 409; CHECK-NEXT: vmov.16 q3[6], r0 410; CHECK-NEXT: vmov.u16 r0, q4[2] 411; CHECK-NEXT: vmov r2, s4 412; 
CHECK-NEXT: vmov.16 q3[7], r0 413; CHECK-NEXT: vdup.32 q5, r2 414; CHECK-NEXT: vmov.f32 s13, s8 415; CHECK-NEXT: vmov.u16 r2, q5[2] 416; CHECK-NEXT: vmov.u16 r0, q3[3] 417; CHECK-NEXT: vmov.16 q6[2], r2 418; CHECK-NEXT: vmov r2, s11 419; CHECK-NEXT: vmov.16 q6[3], r0 420; CHECK-NEXT: vmov.u16 r0, q3[4] 421; CHECK-NEXT: vmov.16 q6[4], r0 422; CHECK-NEXT: vmov.u16 r0, q5[5] 423; CHECK-NEXT: vmov.16 q6[5], r0 424; CHECK-NEXT: vmov.u16 r0, q4[5] 425; CHECK-NEXT: vmov.16 q5[0], r0 426; CHECK-NEXT: vmov.u16 r0, q1[5] 427; CHECK-NEXT: vmov.16 q5[1], r0 428; CHECK-NEXT: vmov.u16 r0, q4[6] 429; CHECK-NEXT: vmov.16 q5[3], r0 430; CHECK-NEXT: vmov.u16 r0, q4[7] 431; CHECK-NEXT: vmov.f32 s13, s25 432; CHECK-NEXT: vmov.16 q5[6], r0 433; CHECK-NEXT: vmov.u16 r0, q1[7] 434; CHECK-NEXT: vmov.f32 s14, s26 435; CHECK-NEXT: vmov.16 q5[7], r0 436; CHECK-NEXT: vdup.32 q6, r2 437; CHECK-NEXT: vmov.f32 s1, s5 438; CHECK-NEXT: vmov.u16 r2, q6[2] 439; CHECK-NEXT: vmov.f32 s22, s7 440; CHECK-NEXT: vrev32.16 q4, q4 441; CHECK-NEXT: vmov.16 q7[2], r2 442; CHECK-NEXT: vmov.u16 r0, q5[3] 443; CHECK-NEXT: vmov.u16 r2, q4[2] 444; CHECK-NEXT: vmov.f32 s2, s10 445; CHECK-NEXT: vmov.16 q7[3], r0 446; CHECK-NEXT: vmov.u16 r0, q5[4] 447; CHECK-NEXT: vstrw.32 q3, [r1] 448; CHECK-NEXT: vmov.16 q3[2], r2 449; CHECK-NEXT: vmov.u16 r2, q0[3] 450; CHECK-NEXT: vmov.16 q7[4], r0 451; CHECK-NEXT: vmov.u16 r0, q6[5] 452; CHECK-NEXT: vmov.16 q3[3], r2 453; CHECK-NEXT: vmov.u16 r2, q0[4] 454; CHECK-NEXT: vmov.16 q7[5], r0 455; CHECK-NEXT: vmov.u16 r0, q4[5] 456; CHECK-NEXT: vmov.16 q3[4], r2 457; CHECK-NEXT: vmov.16 q3[5], r0 458; CHECK-NEXT: vmov.f32 s21, s29 459; CHECK-NEXT: vmov.f32 s1, s13 460; CHECK-NEXT: vmov.f32 s22, s30 461; CHECK-NEXT: vmov.f32 s2, s14 462; CHECK-NEXT: vstrw.32 q5, [r1, #32] 463; CHECK-NEXT: vstrw.32 q0, [r1, #16] 464; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 465; CHECK-NEXT: bx lr 466entry: 467 %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 468 %l1 = load <8 x i16>, <8 x i16>* %s1, align 4 469 %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1 470 %l2 = load <8 x i16>, <8 x i16>* %s2, align 4 471 %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2 472 %l3 = load <8 x i16>, <8 x i16>* %s3, align 4 473 %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 474 %t2 = shufflevector <8 x i16> %l3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 475 %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 476 store <24 x i16> %s, <24 x i16> *%dst 477 ret void 478} 479 480define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) { 481; CHECK-LABEL: vst3_v16i16: 482; CHECK: @ %bb.0: @ %entry 483; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 484; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 485; CHECK-NEXT: .pad #160 486; CHECK-NEXT: sub sp, #160 487; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 488; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 489; CHECK-NEXT: vmov.u16 r2, q0[0] 490; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill 491; CHECK-NEXT: vmov.16 q2[0], r2 492; CHECK-NEXT: vmov.u16 r2, q1[0] 493; CHECK-NEXT: vmov.16 q2[1], r2 494; 
CHECK-NEXT: vmov.u16 r2, q1[1] 495; CHECK-NEXT: vmov.16 q2[4], r2 496; CHECK-NEXT: vmov.u16 r2, q0[2] 497; CHECK-NEXT: vmov.16 q2[6], r2 498; CHECK-NEXT: vmov.u16 r2, q1[2] 499; CHECK-NEXT: vmov.16 q2[7], r2 500; CHECK-NEXT: vmov q7, q1 501; CHECK-NEXT: vmov.f32 s9, s0 502; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 503; CHECK-NEXT: vmov.u16 r2, q2[3] 504; CHECK-NEXT: vmov q3, q2 505; CHECK-NEXT: vmov r3, s0 506; CHECK-NEXT: vmov q1, q0 507; CHECK-NEXT: vdup.32 q0, r3 508; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill 509; CHECK-NEXT: vmov.u16 r3, q0[2] 510; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill 511; CHECK-NEXT: vmov.16 q2[2], r3 512; CHECK-NEXT: vmov.16 q2[3], r2 513; CHECK-NEXT: vmov.u16 r2, q3[4] 514; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 515; CHECK-NEXT: vmov.16 q2[4], r2 516; CHECK-NEXT: vmov.u16 r2, q0[5] 517; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 518; CHECK-NEXT: vmov.16 q2[5], r2 519; CHECK-NEXT: vmov.u16 r2, q3[5] 520; CHECK-NEXT: vmov.16 q5[0], r2 521; CHECK-NEXT: vmov.u16 r2, q0[5] 522; CHECK-NEXT: vmov.16 q5[1], r2 523; CHECK-NEXT: vmov.u16 r2, q3[6] 524; CHECK-NEXT: vmov.16 q5[3], r2 525; CHECK-NEXT: vmov.u16 r2, q3[7] 526; CHECK-NEXT: vmov.16 q5[6], r2 527; CHECK-NEXT: vmov.u16 r2, q0[7] 528; CHECK-NEXT: vmov.16 q5[7], r2 529; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill 530; CHECK-NEXT: vmov.f32 s22, s3 531; CHECK-NEXT: vldrw.u32 q0, [r0] 532; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill 533; CHECK-NEXT: vmov.u16 r2, q5[3] 534; CHECK-NEXT: vmov r0, s3 535; CHECK-NEXT: vmov q2, q0 536; CHECK-NEXT: vdup.32 q0, r0 537; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill 538; CHECK-NEXT: vmov.u16 r0, q0[2] 539; CHECK-NEXT: vmov.16 q4[2], r0 540; CHECK-NEXT: vmov.u16 r0, q5[4] 541; CHECK-NEXT: vmov.16 q4[3], r2 542; CHECK-NEXT: vmov.16 q4[4], r0 543; CHECK-NEXT: vmov.u16 r0, q0[5] 544; CHECK-NEXT: vmov.16 q4[5], r0 545; CHECK-NEXT: vmov.u16 r0, q2[0] 546; CHECK-NEXT: vmov.16 q6[0], r0 547; CHECK-NEXT: vmov.u16 r0, q3[0] 548; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload 549; CHECK-NEXT: vmov.16 q6[1], r0 550; CHECK-NEXT: vmov.u16 r0, q3[1] 551; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill 552; CHECK-NEXT: vmov.16 q6[4], r0 553; CHECK-NEXT: vmov.u16 r0, q2[2] 554; CHECK-NEXT: vmov r2, s0 555; CHECK-NEXT: vmov.16 q6[6], r0 556; CHECK-NEXT: vmov.u16 r0, q3[2] 557; CHECK-NEXT: vdup.32 q0, r2 558; CHECK-NEXT: vmov.16 q6[7], r0 559; CHECK-NEXT: vmov.u16 r2, q0[2] 560; CHECK-NEXT: vmov.f32 s25, s8 561; CHECK-NEXT: vmov.16 q2[2], r2 562; CHECK-NEXT: vmov.u16 r0, q6[3] 563; CHECK-NEXT: vmov q4, q1 564; CHECK-NEXT: vmov.16 q2[3], r0 565; CHECK-NEXT: vmov.u16 r0, q6[4] 566; CHECK-NEXT: vmov.16 q2[4], r0 567; CHECK-NEXT: vmov.u16 r0, q0[5] 568; CHECK-NEXT: vmov.16 q2[5], r0 569; CHECK-NEXT: vmov.u16 r0, q7[5] 570; CHECK-NEXT: vmov.16 q0[0], r0 571; CHECK-NEXT: vmov.u16 r0, q1[5] 572; CHECK-NEXT: vmov.16 q0[1], r0 573; CHECK-NEXT: vmov.u16 r0, q7[6] 574; CHECK-NEXT: vmov.16 q0[3], r0 575; CHECK-NEXT: vmov.u16 r0, q7[7] 576; CHECK-NEXT: vmov.16 q0[6], r0 577; CHECK-NEXT: vmov.u16 r0, q1[7] 578; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload 579; CHECK-NEXT: vmov.16 q0[7], r0 580; CHECK-NEXT: vmov.f32 s2, s19 581; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill 582; CHECK-NEXT: vmov r2, s7 583; CHECK-NEXT: vmov.u16 r0, q0[3] 584; CHECK-NEXT: vdup.32 q7, r2 585; CHECK-NEXT: vrev32.16 q3, q3 586; CHECK-NEXT: vmov.u16 r2, q7[2] 587; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill 588; CHECK-NEXT: vmov.16 q2[2], r2 589; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 
16-byte Spill 590; CHECK-NEXT: vmov.16 q2[3], r0 591; CHECK-NEXT: vmov.u16 r0, q0[4] 592; CHECK-NEXT: vmov.16 q2[4], r0 593; CHECK-NEXT: vmov.u16 r0, q7[5] 594; CHECK-NEXT: vmov.16 q2[5], r0 595; CHECK-NEXT: vmov.u16 r0, q4[2] 596; CHECK-NEXT: vmov.16 q3[0], r0 597; CHECK-NEXT: vmov.u16 r0, q1[3] 598; CHECK-NEXT: vmov.16 q3[1], r0 599; CHECK-NEXT: vmov.u16 r0, q4[4] 600; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload 601; CHECK-NEXT: vmov.16 q3[6], r0 602; CHECK-NEXT: vmov.u16 r0, q1[5] 603; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload 604; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload 605; CHECK-NEXT: vrev32.16 q7, q7 606; CHECK-NEXT: vmov.16 q3[7], r0 607; CHECK-NEXT: vmov.u16 r0, q1[2] 608; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill 609; CHECK-NEXT: vmov.16 q7[0], r0 610; CHECK-NEXT: vmov.u16 r0, q4[3] 611; CHECK-NEXT: vmov.f32 s1, s9 612; CHECK-NEXT: vmov.16 q7[1], r0 613; CHECK-NEXT: vmov.u16 r0, q1[4] 614; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload 615; CHECK-NEXT: vmov.f32 s2, s10 616; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload 617; CHECK-NEXT: vmov.16 q7[6], r0 618; CHECK-NEXT: vmov.f32 s25, s5 619; CHECK-NEXT: vmov.u16 r0, q4[5] 620; CHECK-NEXT: vmov.f32 s26, s6 621; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload 622; CHECK-NEXT: vstrw.32 q0, [r1, #80] 623; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload 624; CHECK-NEXT: vmov.f32 s21, s5 625; CHECK-NEXT: vmov.16 q7[7], r0 626; CHECK-NEXT: vmov.f32 s22, s6 627; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload 628; CHECK-NEXT: vmov.u16 r2, q0[2] 629; CHECK-NEXT: vmov.u16 r0, q0[5] 630; CHECK-NEXT: vmov.f32 s9, s5 631; CHECK-NEXT: vmov.16 q0[2], r2 632; CHECK-NEXT: vmov.f32 s10, s6 633; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload 634; CHECK-NEXT: vstrw.32 q5, [r1, #32] 635; CHECK-NEXT: vstrw.32 q2, [r1, #48] 636; CHECK-NEXT: vmov.f32 s29, s5 637; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload 638; CHECK-NEXT: vmov.f32 s30, s18 639; CHECK-NEXT: vstrw.32 q6, [r1] 640; CHECK-NEXT: vmov.u16 r2, q7[3] 641; CHECK-NEXT: vmov.f32 s13, s5 642; CHECK-NEXT: vmov.16 q0[3], r2 643; CHECK-NEXT: vmov.u16 r2, q7[4] 644; CHECK-NEXT: vmov.16 q0[4], r2 645; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload 646; CHECK-NEXT: vmov.16 q0[5], r0 647; CHECK-NEXT: vmov.f32 s29, s1 648; CHECK-NEXT: vmov.f32 s30, s2 649; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload 650; CHECK-NEXT: vmov.f32 s14, s6 651; CHECK-NEXT: vstrw.32 q7, [r1, #16] 652; CHECK-NEXT: vmov.u16 r2, q0[2] 653; CHECK-NEXT: vmov.u16 r0, q0[5] 654; CHECK-NEXT: vmov.16 q0[2], r2 655; CHECK-NEXT: vmov.u16 r2, q3[3] 656; CHECK-NEXT: vmov.16 q0[3], r2 657; CHECK-NEXT: vmov.u16 r2, q3[4] 658; CHECK-NEXT: vmov.16 q0[4], r2 659; CHECK-NEXT: vmov.16 q0[5], r0 660; CHECK-NEXT: vmov.f32 s13, s1 661; CHECK-NEXT: vmov.f32 s14, s2 662; CHECK-NEXT: vstrw.32 q3, [r1, #64] 663; CHECK-NEXT: add sp, #160 664; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 665; CHECK-NEXT: bx lr 666entry: 667 %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0 668 %l1 = load <16 x i16>, <16 x i16>* %s1, align 4 669 %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1 670 %l2 = load <16 x i16>, <16 x i16>* %s2, align 4 671 %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2 672 %l3 = load <16 x i16>, <16 x i16>* %s3, align 4 673 %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, 
i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 674 %t2 = shufflevector <16 x i16> %l3, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 675 %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 676 store <48 x i16> %s, <48 x i16> *%dst 677 ret void 678} 679 680; i8 681 682define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) { 683; CHECK-LABEL: vst3_v2i8: 684; CHECK: @ %bb.0: @ %entry 685; CHECK-NEXT: .save {r4, r5, r6, lr} 686; CHECK-NEXT: push {r4, r5, r6, lr} 687; CHECK-NEXT: .pad #16 688; CHECK-NEXT: sub sp, #16 689; CHECK-NEXT: ldrb r2, [r0] 690; CHECK-NEXT: movs r6, #0 691; CHECK-NEXT: ldrb r3, [r0, #1] 692; CHECK-NEXT: vmov.32 q0[0], r2 693; CHECK-NEXT: ldrb.w r12, [r0, #2] 694; CHECK-NEXT: vmov.32 q0[2], r3 695; CHECK-NEXT: ldrb.w lr, [r0, #3] 696; CHECK-NEXT: vmov r4, s0 697; CHECK-NEXT: ldrb r5, [r0, #5] 698; CHECK-NEXT: vmov.16 q0[0], r4 699; CHECK-NEXT: ldrb r0, [r0, #4] 700; CHECK-NEXT: vmov.16 q0[1], r12 701; CHECK-NEXT: mov r2, sp 702; CHECK-NEXT: vmov.16 q0[2], r0 703; CHECK-NEXT: add r0, sp, #8 704; CHECK-NEXT: vmov.16 q0[3], r3 705; CHECK-NEXT: vmov.16 q0[4], lr 706; CHECK-NEXT: vmov.16 q0[5], r5 707; CHECK-NEXT: vmov.16 q0[6], r6 708; CHECK-NEXT: vmov.16 q0[7], r6 709; CHECK-NEXT: vstrb.16 q0, [r2] 710; CHECK-NEXT: vstrb.16 q0, [r0] 711; CHECK-NEXT: vldrh.u32 q0, [r0] 712; CHECK-NEXT: ldr r2, [sp] 713; CHECK-NEXT: str r2, [r1] 714; CHECK-NEXT: vmov r0, s2 715; CHECK-NEXT: strh r0, [r1, #4] 716; CHECK-NEXT: add sp, #16 717; CHECK-NEXT: pop {r4, r5, r6, pc} 718entry: 719 %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0 720 %l1 = load <2 x i8>, <2 x i8>* %s1, align 4 721 %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1 722 %l2 = load <2 x i8>, <2 x i8>* %s2, align 4 723 %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2 724 %l3 = load <2 x i8>, <2 x i8>* %s3, align 4 725 %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 726 %t2 = shufflevector <2 x i8> %l3, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 727 %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 728 store <6 x i8> %s, <6 x i8> *%dst 729 ret void 730} 731 732define void @vst3_v4i8(<4 x i8> *%src, <12 x i8> *%dst) { 733; CHECK-LABEL: vst3_v4i8: 734; CHECK: @ %bb.0: @ %entry 735; CHECK-NEXT: .vsave {d8, d9} 736; CHECK-NEXT: vpush {d8, d9} 737; CHECK-NEXT: vldrb.u32 q1, [r0] 738; CHECK-NEXT: vldrb.u32 q2, [r0, #4] 739; CHECK-NEXT: vldrb.u32 q3, [r0, #8] 740; CHECK-NEXT: vmov r2, s4 741; CHECK-NEXT: vmov.16 q0[0], r2 742; CHECK-NEXT: vmov r2, s8 743; CHECK-NEXT: vmov.16 q0[1], r2 744; CHECK-NEXT: vmov r0, s12 745; CHECK-NEXT: vmov.16 q0[2], r0 746; CHECK-NEXT: vmov r0, s5 747; CHECK-NEXT: vmov.16 q0[3], r0 748; CHECK-NEXT: vmov r0, s9 749; CHECK-NEXT: vmov.16 q0[4], r0 750; CHECK-NEXT: vmov r0, s13 
751; CHECK-NEXT: vmov.16 q0[5], r0 752; CHECK-NEXT: vmov r0, s6 753; CHECK-NEXT: vmov.16 q0[6], r0 754; CHECK-NEXT: vmov r0, s10 755; CHECK-NEXT: vmov.16 q0[7], r0 756; CHECK-NEXT: vmov r0, s14 757; CHECK-NEXT: vmov.8 q4[8], r0 758; CHECK-NEXT: vmov r0, s7 759; CHECK-NEXT: vmov.8 q4[9], r0 760; CHECK-NEXT: vmov r0, s11 761; CHECK-NEXT: vmov.8 q4[10], r0 762; CHECK-NEXT: vmov r0, s15 763; CHECK-NEXT: vmov.8 q4[11], r0 764; CHECK-NEXT: vstrb.16 q0, [r1] 765; CHECK-NEXT: vmov r0, s18 766; CHECK-NEXT: str r0, [r1, #8] 767; CHECK-NEXT: vpop {d8, d9} 768; CHECK-NEXT: bx lr 769entry: 770 %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0 771 %l1 = load <4 x i8>, <4 x i8>* %s1, align 4 772 %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1 773 %l2 = load <4 x i8>, <4 x i8>* %s2, align 4 774 %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2 775 %l3 = load <4 x i8>, <4 x i8>* %s3, align 4 776 %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 777 %t2 = shufflevector <4 x i8> %l3, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 778 %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 779 store <12 x i8> %s, <12 x i8> *%dst 780 ret void 781} 782 783define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) { 784; CHECK-LABEL: vst3_v8i8: 785; CHECK: @ %bb.0: @ %entry 786; CHECK-NEXT: .vsave {d8, d9, d10, d11} 787; CHECK-NEXT: vpush {d8, d9, d10, d11} 788; CHECK-NEXT: vldrb.u16 q1, [r0, #8] 789; CHECK-NEXT: vldrb.u16 q2, [r0, #16] 790; CHECK-NEXT: vldrb.u16 q3, [r0] 791; CHECK-NEXT: vmov.u16 r2, q1[5] 792; CHECK-NEXT: vmov.16 q0[0], r2 793; CHECK-NEXT: vmov.u16 r2, q2[5] 794; CHECK-NEXT: vmov.16 q0[1], r2 795; CHECK-NEXT: vmov.u16 r2, q1[6] 796; CHECK-NEXT: vmov.16 q0[3], r2 797; CHECK-NEXT: vmov.u16 r2, q1[7] 798; CHECK-NEXT: vmov.16 q0[6], r2 799; CHECK-NEXT: vmov.u16 r2, q2[7] 800; CHECK-NEXT: vmov r0, s15 801; CHECK-NEXT: vmov.16 q0[7], r2 802; CHECK-NEXT: vdup.32 q4, r0 803; CHECK-NEXT: vmov.f32 s2, s11 804; CHECK-NEXT: vmov.u16 r0, q4[2] 805; CHECK-NEXT: vmov.u16 r2, q0[3] 806; CHECK-NEXT: vmov.16 q5[2], r0 807; CHECK-NEXT: vmov.u16 r0, q0[4] 808; CHECK-NEXT: vmov.16 q5[3], r2 809; CHECK-NEXT: vmov.16 q5[4], r0 810; CHECK-NEXT: vmov.u16 r0, q4[5] 811; CHECK-NEXT: vmov.16 q5[5], r0 812; CHECK-NEXT: vmov.u16 r0, q3[0] 813; CHECK-NEXT: vmov.8 q4[0], r0 814; CHECK-NEXT: vmov.u16 r0, q1[0] 815; CHECK-NEXT: vmov.8 q4[1], r0 816; CHECK-NEXT: vmov.u16 r0, q2[0] 817; CHECK-NEXT: vmov.8 q4[2], r0 818; CHECK-NEXT: vmov.u16 r0, q3[1] 819; CHECK-NEXT: vmov.8 q4[3], r0 820; CHECK-NEXT: vmov.u16 r0, q1[1] 821; CHECK-NEXT: vmov.8 q4[4], r0 822; CHECK-NEXT: vmov.u16 r0, q2[1] 823; CHECK-NEXT: vmov.8 q4[5], r0 824; CHECK-NEXT: vmov.u16 r0, q3[2] 825; CHECK-NEXT: vmov.8 q4[6], r0 826; CHECK-NEXT: vmov.u16 r0, q1[2] 827; CHECK-NEXT: vmov.8 q4[7], r0 828; CHECK-NEXT: vmov.u16 r0, q2[2] 829; CHECK-NEXT: vmov.8 q4[8], r0 830; CHECK-NEXT: vmov.u16 r0, q3[3] 831; CHECK-NEXT: vmov.8 q4[9], r0 832; CHECK-NEXT: vmov.u16 r0, q1[3] 833; CHECK-NEXT: vmov.8 q4[10], r0 834; CHECK-NEXT: vmov.u16 r0, q2[3] 835; CHECK-NEXT: vmov.8 q4[11], r0 836; CHECK-NEXT: vmov.u16 r0, q3[4] 837; CHECK-NEXT: vmov.8 q4[12], r0 838; CHECK-NEXT: vmov.u16 r0, q1[4] 839; CHECK-NEXT: vmov.8 q4[13], r0 840; CHECK-NEXT: vmov.u16 r0, q2[4] 841; CHECK-NEXT: vmov.f32 s1, s21 842; CHECK-NEXT: vmov.8 q4[14], r0 843; CHECK-NEXT: vmov.u16 r0, q3[5] 844; 
CHECK-NEXT: vmov.f32 s2, s22 845; CHECK-NEXT: vmov.8 q4[15], r0 846; CHECK-NEXT: vstrb.16 q0, [r1, #16] 847; CHECK-NEXT: vstrw.32 q4, [r1] 848; CHECK-NEXT: vpop {d8, d9, d10, d11} 849; CHECK-NEXT: bx lr 850entry: 851 %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0 852 %l1 = load <8 x i8>, <8 x i8>* %s1, align 4 853 %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1 854 %l2 = load <8 x i8>, <8 x i8>* %s2, align 4 855 %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2 856 %l3 = load <8 x i8>, <8 x i8>* %s3, align 4 857 %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 858 %t2 = shufflevector <8 x i8> %l3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 859 %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 860 store <24 x i8> %s, <24 x i8> *%dst 861 ret void 862} 863 864define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) { 865; CHECK-LABEL: vst3_v16i8: 866; CHECK: @ %bb.0: @ %entry 867; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 868; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 869; CHECK-NEXT: vldrw.u32 q3, [r0] 870; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 871; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 872; CHECK-NEXT: vmov.u8 r3, q3[0] 873; CHECK-NEXT: vmov.u8 r0, q2[0] 874; CHECK-NEXT: vmov.8 q5[0], r3 875; CHECK-NEXT: vmov.u8 r2, q1[0] 876; CHECK-NEXT: vmov.8 q5[1], r0 877; CHECK-NEXT: vmov.u8 r0, q3[1] 878; CHECK-NEXT: vmov.8 q5[3], r0 879; CHECK-NEXT: vmov.u8 r0, q2[1] 880; CHECK-NEXT: vmov.8 q5[4], r0 881; CHECK-NEXT: vmov.u8 r0, q3[2] 882; CHECK-NEXT: vmov.8 q5[6], r0 883; CHECK-NEXT: vmov.u8 r0, q2[2] 884; CHECK-NEXT: vmov.8 q5[7], r0 885; CHECK-NEXT: vmov.u8 r0, q3[3] 886; CHECK-NEXT: vmov.8 q5[9], r0 887; CHECK-NEXT: vmov.u8 r0, q2[3] 888; CHECK-NEXT: vmov.8 q5[10], r0 889; CHECK-NEXT: vmov.u8 r0, q3[4] 890; CHECK-NEXT: vmov.8 q4[2], r2 891; CHECK-NEXT: vmov.u8 r2, q1[2] 892; CHECK-NEXT: vmov.8 q5[12], r0 893; CHECK-NEXT: vmov.u8 r0, q2[4] 894; CHECK-NEXT: vmov.8 q4[8], r2 895; CHECK-NEXT: vmov.u8 r2, q1[3] 896; CHECK-NEXT: vmov.8 q5[13], r0 897; CHECK-NEXT: vmov.u8 r0, q3[5] 898; CHECK-NEXT: vmov.8 q5[15], r0 899; CHECK-NEXT: vmov.8 q4[11], r2 900; CHECK-NEXT: vmov.u8 r2, q1[4] 901; CHECK-NEXT: vmov.u8 r0, q5[0] 902; CHECK-NEXT: vmov.8 q4[14], r2 903; CHECK-NEXT: vmov.8 q0[0], r0 904; CHECK-NEXT: vmov.f32 s17, s4 905; CHECK-NEXT: vmov.u8 r0, q5[1] 906; CHECK-NEXT: vmov.8 q0[1], r0 907; CHECK-NEXT: vmov.u8 r2, q4[2] 908; CHECK-NEXT: vmov.8 q0[2], r2 909; CHECK-NEXT: vmov.u8 r0, q5[3] 910; CHECK-NEXT: vmov.8 q0[3], r0 911; CHECK-NEXT: vmov.u8 r0, q5[4] 912; CHECK-NEXT: vmov.8 q0[4], r0 913; CHECK-NEXT: vmov.u8 r0, q4[5] 914; CHECK-NEXT: vmov.8 q0[5], r0 915; CHECK-NEXT: vmov.u8 r0, q5[6] 916; CHECK-NEXT: vmov.8 q0[6], r0 917; CHECK-NEXT: vmov.u8 r0, q5[7] 918; CHECK-NEXT: vmov.8 q0[7], r0 919; CHECK-NEXT: vmov.u8 r0, q4[8] 920; CHECK-NEXT: vmov.8 q0[8], r0 921; CHECK-NEXT: vmov.u8 r0, q5[9] 922; CHECK-NEXT: vmov.8 q0[9], r0 923; CHECK-NEXT: vmov.u8 r0, q5[10] 924; CHECK-NEXT: vmov.8 q0[10], r0 925; CHECK-NEXT: vmov.u8 r0, q4[11] 926; CHECK-NEXT: vmov.8 q0[11], r0 927; CHECK-NEXT: vmov.u8 r0, q5[12] 928; CHECK-NEXT: vmov.8 q0[12], r0 929; 
CHECK-NEXT: vmov.u8 r0, q5[13] 930; CHECK-NEXT: vmov.8 q0[13], r0 931; CHECK-NEXT: vmov.u8 r0, q4[14] 932; CHECK-NEXT: vmov.8 q0[14], r0 933; CHECK-NEXT: vmov.u8 r0, q5[15] 934; CHECK-NEXT: vmov.8 q0[15], r0 935; CHECK-NEXT: vmov.u8 r0, q2[5] 936; CHECK-NEXT: vmov.8 q5[0], r0 937; CHECK-NEXT: vmov.u8 r0, q1[5] 938; CHECK-NEXT: vmov.8 q5[1], r0 939; CHECK-NEXT: vmov.u8 r0, q2[6] 940; CHECK-NEXT: vmov.8 q5[3], r0 941; CHECK-NEXT: vmov.u8 r0, q1[6] 942; CHECK-NEXT: vmov.8 q5[4], r0 943; CHECK-NEXT: vmov.u8 r0, q2[7] 944; CHECK-NEXT: vmov.8 q5[6], r0 945; CHECK-NEXT: vmov.u8 r0, q1[7] 946; CHECK-NEXT: vmov.8 q5[7], r0 947; CHECK-NEXT: vmov.u8 r0, q2[8] 948; CHECK-NEXT: vmov.8 q5[9], r0 949; CHECK-NEXT: vmov.u8 r0, q1[8] 950; CHECK-NEXT: vmov.8 q5[10], r0 951; CHECK-NEXT: vmov.u8 r0, q2[9] 952; CHECK-NEXT: vmov.8 q5[12], r0 953; CHECK-NEXT: vmov.u8 r0, q1[9] 954; CHECK-NEXT: vmov.8 q5[13], r0 955; CHECK-NEXT: vmov.u8 r0, q2[10] 956; CHECK-NEXT: vmov.8 q5[15], r0 957; CHECK-NEXT: vstrw.32 q0, [r1] 958; CHECK-NEXT: vmov.u8 r0, q5[0] 959; CHECK-NEXT: vmov.8 q4[0], r0 960; CHECK-NEXT: vmov.u8 r0, q5[1] 961; CHECK-NEXT: vmov.8 q4[1], r0 962; CHECK-NEXT: vmov.u8 r0, q3[7] 963; CHECK-NEXT: vmov.8 q6[5], r0 964; CHECK-NEXT: vmov.u8 r0, q3[8] 965; CHECK-NEXT: vmov.8 q6[8], r0 966; CHECK-NEXT: vmov.u8 r0, q3[9] 967; CHECK-NEXT: vmov.8 q6[11], r0 968; CHECK-NEXT: vmov.f32 s24, s13 969; CHECK-NEXT: vmov.f32 s27, s14 970; CHECK-NEXT: vmov.u8 r0, q6[2] 971; CHECK-NEXT: vmov.8 q4[2], r0 972; CHECK-NEXT: vmov.u8 r0, q5[3] 973; CHECK-NEXT: vmov.8 q4[3], r0 974; CHECK-NEXT: vmov.u8 r0, q5[4] 975; CHECK-NEXT: vmov.8 q4[4], r0 976; CHECK-NEXT: vmov.u8 r0, q6[5] 977; CHECK-NEXT: vmov.8 q4[5], r0 978; CHECK-NEXT: vmov.u8 r0, q5[6] 979; CHECK-NEXT: vmov.8 q4[6], r0 980; CHECK-NEXT: vmov.u8 r0, q5[7] 981; CHECK-NEXT: vmov.8 q4[7], r0 982; CHECK-NEXT: vmov.u8 r0, q6[8] 983; CHECK-NEXT: vmov.8 q4[8], r0 984; CHECK-NEXT: vmov.u8 r0, q5[9] 985; CHECK-NEXT: vmov.8 q4[9], r0 986; CHECK-NEXT: vmov.u8 r0, q5[10] 987; CHECK-NEXT: vmov.8 q4[10], r0 988; CHECK-NEXT: vmov.u8 r0, q6[11] 989; CHECK-NEXT: vmov.8 q4[11], r0 990; CHECK-NEXT: vmov.u8 r0, q5[12] 991; CHECK-NEXT: vmov.8 q4[12], r0 992; CHECK-NEXT: vmov.u8 r0, q5[13] 993; CHECK-NEXT: vmov.8 q4[13], r0 994; CHECK-NEXT: vmov.u8 r0, q6[14] 995; CHECK-NEXT: vmov.8 q4[14], r0 996; CHECK-NEXT: vmov.u8 r0, q5[15] 997; CHECK-NEXT: vmov.8 q4[15], r0 998; CHECK-NEXT: vmov.u8 r0, q1[10] 999; CHECK-NEXT: vmov.8 q5[0], r0 1000; CHECK-NEXT: vmov.u8 r0, q3[11] 1001; CHECK-NEXT: vmov.8 q5[1], r0 1002; CHECK-NEXT: vmov.u8 r0, q1[11] 1003; CHECK-NEXT: vmov.8 q5[3], r0 1004; CHECK-NEXT: vmov.u8 r0, q3[12] 1005; CHECK-NEXT: vmov.8 q5[4], r0 1006; CHECK-NEXT: vmov.u8 r0, q1[12] 1007; CHECK-NEXT: vmov.8 q5[6], r0 1008; CHECK-NEXT: vmov.u8 r0, q3[13] 1009; CHECK-NEXT: vmov.8 q5[7], r0 1010; CHECK-NEXT: vmov.u8 r0, q1[13] 1011; CHECK-NEXT: vmov.8 q5[9], r0 1012; CHECK-NEXT: vmov.u8 r0, q3[14] 1013; CHECK-NEXT: vmov.8 q5[10], r0 1014; CHECK-NEXT: vmov.u8 r0, q1[14] 1015; CHECK-NEXT: vmov.8 q5[12], r0 1016; CHECK-NEXT: vmov.u8 r0, q3[15] 1017; CHECK-NEXT: vmov.8 q5[13], r0 1018; CHECK-NEXT: vmov.u8 r0, q1[15] 1019; CHECK-NEXT: vmov.8 q5[15], r0 1020; CHECK-NEXT: vstrw.32 q4, [r1, #16] 1021; CHECK-NEXT: vmov.u8 r0, q5[0] 1022; CHECK-NEXT: vmov.8 q1[0], r0 1023; CHECK-NEXT: vmov.u8 r0, q5[1] 1024; CHECK-NEXT: vmov.8 q1[1], r0 1025; CHECK-NEXT: vmov.u8 r0, q2[11] 1026; CHECK-NEXT: vmov.8 q3[2], r0 1027; CHECK-NEXT: vmov.u8 r0, q2[12] 1028; CHECK-NEXT: vmov.8 q3[5], r0 1029; CHECK-NEXT: vmov.u8 r0, 
q2[13] 1030; CHECK-NEXT: vmov.8 q3[8], r0 1031; CHECK-NEXT: vmov.u8 r0, q2[14] 1032; CHECK-NEXT: vmov.8 q3[11], r0 1033; CHECK-NEXT: vmov.u8 r0, q2[15] 1034; CHECK-NEXT: vmov.8 q3[14], r0 1035; CHECK-NEXT: vmov.u8 r0, q3[2] 1036; CHECK-NEXT: vmov.8 q1[2], r0 1037; CHECK-NEXT: vmov.u8 r0, q5[3] 1038; CHECK-NEXT: vmov.8 q1[3], r0 1039; CHECK-NEXT: vmov.u8 r0, q5[4] 1040; CHECK-NEXT: vmov.8 q1[4], r0 1041; CHECK-NEXT: vmov.u8 r0, q3[5] 1042; CHECK-NEXT: vmov.8 q1[5], r0 1043; CHECK-NEXT: vmov.u8 r0, q5[6] 1044; CHECK-NEXT: vmov.8 q1[6], r0 1045; CHECK-NEXT: vmov.u8 r0, q5[7] 1046; CHECK-NEXT: vmov.8 q1[7], r0 1047; CHECK-NEXT: vmov.u8 r0, q3[8] 1048; CHECK-NEXT: vmov.8 q1[8], r0 1049; CHECK-NEXT: vmov.u8 r0, q5[9] 1050; CHECK-NEXT: vmov.8 q1[9], r0 1051; CHECK-NEXT: vmov.u8 r0, q5[10] 1052; CHECK-NEXT: vmov.8 q1[10], r0 1053; CHECK-NEXT: vmov.u8 r0, q3[11] 1054; CHECK-NEXT: vmov.8 q1[11], r0 1055; CHECK-NEXT: vmov.u8 r0, q5[12] 1056; CHECK-NEXT: vmov.8 q1[12], r0 1057; CHECK-NEXT: vmov.u8 r0, q5[13] 1058; CHECK-NEXT: vmov.8 q1[13], r0 1059; CHECK-NEXT: vmov.u8 r0, q3[14] 1060; CHECK-NEXT: vmov.8 q1[14], r0 1061; CHECK-NEXT: vmov.u8 r0, q5[15] 1062; CHECK-NEXT: vmov.8 q1[15], r0 1063; CHECK-NEXT: vstrw.32 q1, [r1, #32] 1064; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 1065; CHECK-NEXT: bx lr 1066entry: 1067 %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0 1068 %l1 = load <16 x i8>, <16 x i8>* %s1, align 4 1069 %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1 1070 %l2 = load <16 x i8>, <16 x i8>* %s2, align 4 1071 %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2 1072 %l3 = load <16 x i8>, <16 x i8>* %s3, align 4 1073 %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1074 %t2 = shufflevector <16 x i8> %l3, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1075 %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 1076 store <48 x i8> %s, <48 x i8> *%dst 1077 ret void 1078} 1079 1080; i64 1081 1082define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) { 1083; CHECK-LABEL: vst3_v2i64: 1084; CHECK: @ %bb.0: @ %entry 1085; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 1086; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 1087; CHECK-NEXT: vldrw.u32 q1, [r0] 1088; CHECK-NEXT: vmov.f64 d6, d5 1089; CHECK-NEXT: vmov.f32 s13, s11 1090; CHECK-NEXT: vmov.f32 s14, s2 1091; CHECK-NEXT: vmov.f32 s15, s3 1092; CHECK-NEXT: vmov.f32 s2, s6 1093; CHECK-NEXT: vmov.f32 s3, s7 1094; CHECK-NEXT: vmov.f32 s6, s8 1095; CHECK-NEXT: vmov.f32 s7, s9 1096; CHECK-NEXT: vstrb.8 q1, [r1], #32 1097; CHECK-NEXT: vstrw.32 q3, [r1] 1098; CHECK-NEXT: vstrw.32 q0, [r1, #-16] 1099; CHECK-NEXT: bx lr 1100entry: 1101 %s1 = getelementptr <2 x 
i64>, <2 x i64>* %src, i32 0 1102 %l1 = load <2 x i64>, <2 x i64>* %s1, align 4 1103 %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1 1104 %l2 = load <2 x i64>, <2 x i64>* %s2, align 4 1105 %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2 1106 %l3 = load <2 x i64>, <2 x i64>* %s3, align 4 1107 %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1108 %t2 = shufflevector <2 x i64> %l3, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1109 %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 1110 store <6 x i64> %s, <6 x i64> *%dst 1111 ret void 1112} 1113 1114define void @vst3_v4i64(<4 x i64> *%src, <12 x i64> *%dst) { 1115; CHECK-LABEL: vst3_v4i64: 1116; CHECK: @ %bb.0: @ %entry 1117; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1118; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1119; CHECK-NEXT: .pad #16 1120; CHECK-NEXT: sub sp, #16 1121; CHECK-NEXT: vldrw.u32 q1, [r0] 1122; CHECK-NEXT: vldrw.u32 q7, [r0, #32] 1123; CHECK-NEXT: vldrw.u32 q6, [r0, #16] 1124; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 1125; CHECK-NEXT: vmov.f64 d10, d2 1126; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill 1127; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 1128; CHECK-NEXT: vldrw.u32 q2, [r0, #64] 1129; CHECK-NEXT: vmov.f32 s21, s5 1130; CHECK-NEXT: vmov.f32 s22, s28 1131; CHECK-NEXT: vmov.f32 s23, s29 1132; CHECK-NEXT: vmov.f64 d14, d12 1133; CHECK-NEXT: vstrw.32 q5, [r1] 1134; CHECK-NEXT: vmov.f32 s29, s25 1135; CHECK-NEXT: vmov.f64 d8, d7 1136; CHECK-NEXT: vmov.f32 s30, s12 1137; CHECK-NEXT: vmov.f32 s17, s15 1138; CHECK-NEXT: vmov.f32 s31, s13 1139; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload 1140; CHECK-NEXT: vmov.f32 s18, s2 1141; CHECK-NEXT: vstrw.32 q7, [r1, #48] 1142; CHECK-NEXT: vmov.f32 s4, s8 1143; CHECK-NEXT: vmov.f32 s19, s3 1144; CHECK-NEXT: vmov.f32 s2, s26 1145; CHECK-NEXT: vstrw.32 q4, [r1, #80] 1146; CHECK-NEXT: vmov.f32 s5, s9 1147; CHECK-NEXT: vmov.f32 s8, s14 1148; CHECK-NEXT: vstrw.32 q1, [r1, #16] 1149; CHECK-NEXT: vmov.f32 s3, s27 1150; CHECK-NEXT: vmov.f32 s9, s15 1151; CHECK-NEXT: vstrw.32 q0, [r1, #64] 1152; CHECK-NEXT: vstrw.32 q2, [r1, #32] 1153; CHECK-NEXT: add sp, #16 1154; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1155; CHECK-NEXT: bx lr 1156entry: 1157 %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0 1158 %l1 = load <4 x i64>, <4 x i64>* %s1, align 4 1159 %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1 1160 %l2 = load <4 x i64>, <4 x i64>* %s2, align 4 1161 %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2 1162 %l3 = load <4 x i64>, <4 x i64>* %s3, align 4 1163 %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1164 %t2 = shufflevector <4 x i64> %l3, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1165 %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 1166 store <12 x i64> %s, <12 x i64> *%dst 1167 ret void 1168} 1169 1170; f32 1171 1172define void @vst3_v2f32(<2 x float> *%src, <6 x float> *%dst) { 1173; CHECK-LABEL: vst3_v2f32: 1174; CHECK: @ %bb.0: @ %entry 1175; CHECK-NEXT: vldr s0, [r0] 1176; CHECK-NEXT: vldr s3, [r0, #4] 1177; CHECK-NEXT: vldr s1, [r0, #8] 1178; CHECK-NEXT: ldr r2, [r0, #20] 1179; CHECK-NEXT: vldr s2, [r0, #16] 1180; CHECK-NEXT: ldr r0, [r0, #12] 1181; CHECK-NEXT: strd r0, r2, [r1, #16] 1182; 
CHECK-NEXT: vstrw.32 q0, [r1] 1183; CHECK-NEXT: bx lr 1184entry: 1185 %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0 1186 %l1 = load <2 x float>, <2 x float>* %s1, align 4 1187 %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1 1188 %l2 = load <2 x float>, <2 x float>* %s2, align 4 1189 %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2 1190 %l3 = load <2 x float>, <2 x float>* %s3, align 4 1191 %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1192 %t2 = shufflevector <2 x float> %l3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1193 %s = shufflevector <4 x float> %t1, <4 x float> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 1194 store <6 x float> %s, <6 x float> *%dst 1195 ret void 1196} 1197 1198define void @vst3_v4f32(<4 x float> *%src, <12 x float> *%dst) { 1199; CHECK-LABEL: vst3_v4f32: 1200; CHECK: @ %bb.0: @ %entry 1201; CHECK-NEXT: .vsave {d8, d9} 1202; CHECK-NEXT: vpush {d8, d9} 1203; CHECK-NEXT: vldrw.u32 q3, [r0] 1204; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 1205; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 1206; CHECK-NEXT: vmov.f64 d8, d6 1207; CHECK-NEXT: vmov.f32 s17, s4 1208; CHECK-NEXT: vmov.f32 s8, s5 1209; CHECK-NEXT: vmov.f32 s19, s13 1210; CHECK-NEXT: vmov.f32 s9, s1 1211; CHECK-NEXT: vmov.f32 s18, s0 1212; CHECK-NEXT: vmov.f32 s0, s2 1213; CHECK-NEXT: vstrw.32 q4, [r1] 1214; CHECK-NEXT: vmov.f32 s11, s6 1215; CHECK-NEXT: vmov.f32 s1, s15 1216; CHECK-NEXT: vmov.f32 s10, s14 1217; CHECK-NEXT: vmov.f32 s2, s7 1218; CHECK-NEXT: vstrw.32 q2, [r1, #16] 1219; CHECK-NEXT: vstrw.32 q0, [r1, #32] 1220; CHECK-NEXT: vpop {d8, d9} 1221; CHECK-NEXT: bx lr 1222entry: 1223 %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0 1224 %l1 = load <4 x float>, <4 x float>* %s1, align 4 1225 %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1 1226 %l2 = load <4 x float>, <4 x float>* %s2, align 4 1227 %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2 1228 %l3 = load <4 x float>, <4 x float>* %s3, align 4 1229 %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1230 %t2 = shufflevector <4 x float> %l3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1231 %s = shufflevector <8 x float> %t1, <8 x float> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 1232 store <12 x float> %s, <12 x float> *%dst 1233 ret void 1234} 1235 1236define void @vst3_v8f32(<8 x float> *%src, <24 x float> *%dst) { 1237; CHECK-LABEL: vst3_v8f32: 1238; CHECK: @ %bb.0: @ %entry 1239; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1240; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1241; CHECK-NEXT: .pad #16 1242; CHECK-NEXT: sub sp, #16 1243; CHECK-NEXT: vldrw.u32 q4, [r0] 1244; CHECK-NEXT: vldrw.u32 q7, [r0, #32] 1245; CHECK-NEXT: vldrw.u32 q6, [r0, #16] 1246; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 1247; CHECK-NEXT: vmov.f64 d10, d8 1248; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 1249; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill 1250; CHECK-NEXT: vldrw.u32 q1, [r0, #64] 1251; CHECK-NEXT: vmov.f32 s21, s28 1252; CHECK-NEXT: vmov.f64 d14, d12 1253; CHECK-NEXT: vmov.f64 d4, d1 1254; CHECK-NEXT: vmov.f32 s29, s12 1255; CHECK-NEXT: vmov.f32 s9, s27 1256; CHECK-NEXT: vmov.f32 s31, s25 1257; CHECK-NEXT: vmov.f32 s11, s3 1258; CHECK-NEXT: vmov.f32 s30, s0 1259; CHECK-NEXT: vmov.f32 s0, s13 1260; CHECK-NEXT: vstrw.32 q7, [r1, #48] 
1261; CHECK-NEXT: vmov.f32 s3, s14 1262; CHECK-NEXT: vmov.f32 s2, s26 1263; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload 1264; CHECK-NEXT: vmov.f32 s10, s15 1265; CHECK-NEXT: vstrw.32 q0, [r1, #64] 1266; CHECK-NEXT: vmov.f32 s23, s17 1267; CHECK-NEXT: vstrw.32 q2, [r1, #80] 1268; CHECK-NEXT: vmov.f32 s12, s25 1269; CHECK-NEXT: vmov.f32 s13, s5 1270; CHECK-NEXT: vmov.f32 s22, s4 1271; CHECK-NEXT: vmov.f32 s4, s6 1272; CHECK-NEXT: vstrw.32 q5, [r1] 1273; CHECK-NEXT: vmov.f32 s15, s26 1274; CHECK-NEXT: vmov.f32 s5, s19 1275; CHECK-NEXT: vmov.f32 s14, s18 1276; CHECK-NEXT: vmov.f32 s6, s27 1277; CHECK-NEXT: vstrw.32 q3, [r1, #16] 1278; CHECK-NEXT: vstrw.32 q1, [r1, #32] 1279; CHECK-NEXT: add sp, #16 1280; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1281; CHECK-NEXT: bx lr 1282entry: 1283 %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0 1284 %l1 = load <8 x float>, <8 x float>* %s1, align 4 1285 %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1 1286 %l2 = load <8 x float>, <8 x float>* %s2, align 4 1287 %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2 1288 %l3 = load <8 x float>, <8 x float>* %s3, align 4 1289 %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1290 %t2 = shufflevector <8 x float> %l3, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1291 %s = shufflevector <16 x float> %t1, <16 x float> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 1292 store <24 x float> %s, <24 x float> *%dst 1293 ret void 1294} 1295 1296define void @vst3_v16f32(<16 x float> *%src, <48 x float> *%dst) { 1297; CHECK-LABEL: vst3_v16f32: 1298; CHECK: @ %bb.0: @ %entry 1299; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1300; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1301; CHECK-NEXT: .pad #160 1302; CHECK-NEXT: sub sp, #160 1303; CHECK-NEXT: vldrw.u32 q5, [r0, #96] 1304; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 1305; CHECK-NEXT: vldrw.u32 q1, [r0, #128] 1306; CHECK-NEXT: vldrw.u32 q6, [r0] 1307; CHECK-NEXT: vstrw.32 q5, [sp, #112] @ 16-byte Spill 1308; CHECK-NEXT: vldrw.u32 q5, [r0, #80] 1309; CHECK-NEXT: vmov.f32 s16, s1 1310; CHECK-NEXT: vldrw.u32 q3, [r0, #160] 1311; CHECK-NEXT: vstrw.32 q5, [sp, #144] @ 16-byte Spill 1312; CHECK-NEXT: vldrw.u32 q5, [r0, #48] 1313; CHECK-NEXT: vmov.f32 s17, s5 1314; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill 1315; CHECK-NEXT: vmov.f32 s19, s2 1316; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill 1317; CHECK-NEXT: vldrw.u32 q5, [r0, #32] 1318; CHECK-NEXT: vmov.f32 s18, s26 1319; CHECK-NEXT: vldrw.u32 q7, [r0, #144] 1320; CHECK-NEXT: vldrw.u32 q2, [r0, #176] 1321; CHECK-NEXT: vstrw.32 q5, [sp, #16] @ 16-byte Spill 1322; CHECK-NEXT: vldrw.u32 q5, [r0, #16] 1323; CHECK-NEXT: vldrw.u32 q3, [r0, #112] 1324; CHECK-NEXT: vstrw.32 q4, [r1, #16] 1325; CHECK-NEXT: vmov.f64 d8, d3 1326; CHECK-NEXT: vstrw.32 q5, [sp, #48] @ 16-byte Spill 1327; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload 1328; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill 1329; CHECK-NEXT: vmov.f32 s17, s27 1330; CHECK-NEXT: vmov.f32 s19, s7 1331; CHECK-NEXT: vmov.f32 s18, s3 1332; CHECK-NEXT: vstrw.32 q4, [r1, #32] 1333; CHECK-NEXT: 
vmov.f64 d8, d5 1334; CHECK-NEXT: vmov.f32 s17, s23 1335; CHECK-NEXT: vmov.f32 s19, s11 1336; CHECK-NEXT: vmov.f32 s18, s15 1337; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill 1338; CHECK-NEXT: vmov.f64 d8, d12 1339; CHECK-NEXT: vmov.f32 s17, s0 1340; CHECK-NEXT: vmov.f32 s19, s25 1341; CHECK-NEXT: vmov.f32 s18, s4 1342; CHECK-NEXT: vmov q1, q5 1343; CHECK-NEXT: vmov.f64 d0, d2 1344; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill 1345; CHECK-NEXT: vmov.f32 s1, s12 1346; CHECK-NEXT: vmov.f32 s3, s5 1347; CHECK-NEXT: vmov.f32 s2, s8 1348; CHECK-NEXT: vmov.f32 s8, s13 1349; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill 1350; CHECK-NEXT: vmov.f32 s11, s14 1351; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload 1352; CHECK-NEXT: vmov.f32 s10, s6 1353; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload 1354; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill 1355; CHECK-NEXT: vmov.f64 d8, d1 1356; CHECK-NEXT: vmov q2, q1 1357; CHECK-NEXT: vmov.f32 s20, s5 1358; CHECK-NEXT: vmov.f32 s21, s1 1359; CHECK-NEXT: vmov.f32 s23, s6 1360; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload 1361; CHECK-NEXT: vmov.f64 d6, d15 1362; CHECK-NEXT: vmov q6, q1 1363; CHECK-NEXT: vmov.f32 s17, s7 1364; CHECK-NEXT: vmov.f32 s22, s6 1365; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload 1366; CHECK-NEXT: vmov.f32 s19, s3 1367; CHECK-NEXT: vmov q0, q6 1368; CHECK-NEXT: vmov.f32 s13, s7 1369; CHECK-NEXT: vstrw.32 q5, [r1, #112] 1370; CHECK-NEXT: vmov.f32 s15, s31 1371; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload 1372; CHECK-NEXT: vmov.f32 s18, s11 1373; CHECK-NEXT: vldrw.u32 q2, [sp, #144] @ 16-byte Reload 1374; CHECK-NEXT: vmov.f32 s25, s28 1375; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload 1376; CHECK-NEXT: vmov.f32 s27, s1 1377; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload 1378; CHECK-NEXT: vmov.f32 s14, s11 1379; CHECK-NEXT: vstrw.32 q4, [r1, #128] 1380; CHECK-NEXT: vmov.f32 s26, s0 1381; CHECK-NEXT: vstrw.32 q3, [r1, #80] 1382; CHECK-NEXT: vmov.f64 d0, d2 1383; CHECK-NEXT: vstrw.32 q6, [r1, #96] 1384; CHECK-NEXT: vmov.f32 s1, s8 1385; CHECK-NEXT: vmov q2, q1 1386; CHECK-NEXT: vmov.f32 s3, s5 1387; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload 1388; CHECK-NEXT: vmov.f32 s2, s28 1389; CHECK-NEXT: vstrw.32 q0, [r1, #48] 1390; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload 1391; CHECK-NEXT: vmov.f32 s28, s5 1392; CHECK-NEXT: vstrw.32 q0, [r1, #144] 1393; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload 1394; CHECK-NEXT: vmov.f32 s31, s6 1395; CHECK-NEXT: vstrw.32 q0, [r1, #160] 1396; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload 1397; CHECK-NEXT: vmov.f32 s30, s10 1398; CHECK-NEXT: vstrw.32 q0, [r1, #176] 1399; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload 1400; CHECK-NEXT: vstrw.32 q7, [r1, #64] 1401; CHECK-NEXT: vstrw.32 q0, [r1] 1402; CHECK-NEXT: add sp, #160 1403; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1404; CHECK-NEXT: bx lr 1405entry: 1406 %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0 1407 %l1 = load <16 x float>, <16 x float>* %s1, align 4 1408 %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1 1409 %l2 = load <16 x float>, <16 x float>* %s2, align 4 1410 %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2 1411 %l3 = load <16 x float>, <16 x float>* %s3, align 4 1412 %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, 
  %t2 = shufflevector <16 x float> %l3, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <32 x float> %t1, <32 x float> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x float> %s, <48 x float> *%dst
  ret void
}

; f16

define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) {
; CHECK-LABEL: vst3_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldmia r0, {s4, s5}
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    ldr r0, [r0, #8]
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmovx.f16 s12, s4
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    vmov.16 q0[1], r2
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmovx.f16 s4, s5
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmovx.f16 s4, s8
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    stm r1!, {r0, r2, r3}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
  %l1 = load <2 x half>, <2 x half>* %s1, align 4
  %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
  %l2 = load <2 x half>, <2 x half>* %s2, align 4
  %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2
  %l3 = load <2 x half>, <2 x half>* %s3, align 4
  %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x half> %l3, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x half> %t1, <4 x half> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x half> %s, <6 x half> *%dst
  ret void
}

define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
; CHECK-LABEL: vst3_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q0[1], r3
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmovx.f16 s12, s0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov.16 q2[0], r3
; CHECK-NEXT:    vmov.16 q2[1], r2
; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vmov.32 q1[0], r2
; CHECK-NEXT:    vmov.32 q1[1], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q2[2], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmovx.f16 s12, s2
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmovx.f16 s12, s4
; CHECK-NEXT:    vmov.16 q2[4], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.16 q2[5], r0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov.16 q2[6], r0
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    vmov.16 q2[7], r0
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    vstrw.32 q2, [r1]
; CHECK-NEXT:    vmovx.f16 s8, s1
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov.16 q2[1], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmovx.f16 s0, s5
; CHECK-NEXT:    vmov.16 q2[2], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    strd r2, r0, [r1, #16]
; CHECK-NEXT:    pop {r7, pc}
entry:
  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
  %l1 = load <4 x half>, <4 x half>* %s1, align 4
  %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
  %l2 = load <4 x half>, <4 x half>* %s2, align 4
  %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2
  %l3 = load <4 x half>, <4 x half>* %s3, align 4
  %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x half> %l3, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x half> %t1, <8 x half> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x half> %s, <12 x half> *%dst
  ret void
}

define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
; CHECK-LABEL: vst3_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmovx.f16 s12, s4
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    vmov.16 q0[1], r2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    vmov.16 q0[4], r2
; CHECK-NEXT:    vdup.32 q4, r0
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    vmovx.f16 s12, s8
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.16 q0[7], r2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.f32 s1, s8
; CHECK-NEXT:    vmov.16 q3[3], r2
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmovx.f16 s16, s18
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmovx.f16 s16, s22
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmovx.f16 s16, s6
; CHECK-NEXT:    vmovx.f16 s24, s7
; CHECK-NEXT:    vmov r2, s16
; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov.16 q4[0], r2
; CHECK-NEXT:    vmov r2, s11
; CHECK-NEXT:    vmov.16 q4[1], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.16 q4[3], r0
; CHECK-NEXT:    vmov r0, s24
; CHECK-NEXT:    vmovx.f16 s24, s23
; CHECK-NEXT:    vmov.16 q4[6], r0
; CHECK-NEXT:    vmov r0, s24
; CHECK-NEXT:    vdup.32 q7, r2
; CHECK-NEXT:    vmov.16 q4[7], r0
; CHECK-NEXT:    vmov r2, s29
; CHECK-NEXT:    vmov.f32 s18, s23
; CHECK-NEXT:    vmovx.f16 s24, s17
; CHECK-NEXT:    vmov r0, s24
; CHECK-NEXT:    vmov.16 q6[2], r2
; CHECK-NEXT:    vmov.16 q6[3], r0
; CHECK-NEXT:    vmovx.f16 s28, s30
; CHECK-NEXT:    vmovx.f16 s4, s10
; CHECK-NEXT:    vmov.f32 s1, s13
; CHECK-NEXT:    vmov.f32 s2, s14
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.16 q6[4], r0
; CHECK-NEXT:    vmov r0, s28
; CHECK-NEXT:    vmov.16 q6[5], r0
; CHECK-NEXT:    vmovx.f16 s28, s9
; CHECK-NEXT:    vmov r0, s21
; CHECK-NEXT:    vmov r2, s28
; CHECK-NEXT:    vmov.16 q7[0], r0
; CHECK-NEXT:    vmov.16 q7[1], r2
; CHECK-NEXT:    vmov r0, s22
; CHECK-NEXT:    vmov.16 q7[6], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q7[7], r0
; CHECK-NEXT:    vmov.f32 s17, s25
; CHECK-NEXT:    vmov.f32 s29, s21
; CHECK-NEXT:    vmov.f32 s30, s10
; CHECK-NEXT:    vmovx.f16 s4, s29
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s18, s26
; CHECK-NEXT:    vrev32.16 q2, q1
; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    vmovx.f16 s8, s10
; CHECK-NEXT:    vmov.16 q1[2], r2
; CHECK-NEXT:    vmov.16 q1[3], r0
; CHECK-NEXT:    vmov r0, s30
; CHECK-NEXT:    vmov.16 q1[4], r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q1[5], r0
; CHECK-NEXT:    vmov.f32 s29, s5
; CHECK-NEXT:    vmov.f32 s30, s6
; CHECK-NEXT:    vstrw.32 q7, [r1, #16]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
  %l1 = load <8 x half>, <8 x half>* %s1, align 4
  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
  %l2 = load <8 x half>, <8 x half>* %s2, align 4
  %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
  %l3 = load <8 x half>, <8 x half>* %s3, align 4
  %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x half> %l3, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <16 x half> %t1, <16 x half> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x half> %s, <24 x half> *%dst
  ret void
}

define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) {
; CHECK-LABEL: vst3_v16f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #144
; CHECK-NEXT:    sub sp, #144
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov.16 q1[0], r3
; CHECK-NEXT:    vmovx.f16 s0, s8
; CHECK-NEXT:    vmov.16 q1[1], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s0, s12
; CHECK-NEXT:    vmov.16 q1[4], r2
; CHECK-NEXT:    vmov r2, s13
; CHECK-NEXT:    vmov.16 q1[6], r2
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    vmov.16 q1[7], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vmov.f32 s5, s12
; CHECK-NEXT:    vmov q5, q3
; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov q4, q0
; CHECK-NEXT:    vdup.32 q0, r3
; CHECK-NEXT:    vstrw.32 q1, [sp, #128] @ 16-byte Spill
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vmov.16 q3[2], r3
; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov.16 q3[3], r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.16 q3[4], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s0, s18
; CHECK-NEXT:    vmov.16 q3[5], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s0, s10
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmovx.f16 s0, s11
; CHECK-NEXT:    vmov.16 q1[0], r3
; CHECK-NEXT:    vmov r3, s23
; CHECK-NEXT:    vmov.16 q1[1], r2
; CHECK-NEXT:    vmov r2, s11
; CHECK-NEXT:    vmov.16 q1[3], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s0, s19
; CHECK-NEXT:    vmov.16 q1[6], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
; CHECK-NEXT:    vmov.16 q1[7], r2
; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s6, s19
; CHECK-NEXT:    vmovx.f16 s0, s5
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vdup.32 q0, r3
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vmov.16 q2[2], r3
; CHECK-NEXT:    vmov r3, s20
; CHECK-NEXT:    vmov.16 q2[3], r2
; CHECK-NEXT:    vmov.16 q7[0], r3
; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
; CHECK-NEXT:    vstrw.32 q1, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q4, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.16 q2[4], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov.16 q2[5], r2
; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmovx.f16 s0, s8
; CHECK-NEXT:    vmov.16 q7[1], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov.16 q7[4], r2
; CHECK-NEXT:    vmov r2, s21
; CHECK-NEXT:    vmov.16 q7[6], r2
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    vmovx.f16 s0, s20
; CHECK-NEXT:    vmov.16 q7[7], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vmov.16 q1[2], r0
; CHECK-NEXT:    vmov.f32 s29, s20
; CHECK-NEXT:    vmov.16 q1[3], r2
; CHECK-NEXT:    vmov r0, s30
; CHECK-NEXT:    vmov.16 q1[4], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q1[5], r0
; CHECK-NEXT:    vmovx.f16 s0, s14
; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vmov q1, q2
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmovx.f16 s0, s6
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s0, s7
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov r2, s23
; CHECK-NEXT:    vmov.16 q2[1], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmovx.f16 s0, s15
; CHECK-NEXT:    vmov.16 q2[6], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q2[7], r0
; CHECK-NEXT:    vmov.f32 s10, s15
; CHECK-NEXT:    vmovx.f16 s0, s9
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vdup.32 q0, r2
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vmov.16 q6[2], r2
; CHECK-NEXT:    vmov.16 q6[3], r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.16 q6[4], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q6[5], r0
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmovx.f16 s0, s21
; CHECK-NEXT:    vmov.16 q4[0], r0
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s0, s22
; CHECK-NEXT:    vmov.16 q4[1], r2
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.16 q4[6], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q4[7], r0
; CHECK-NEXT:    vmov.f32 s9, s25
; CHECK-NEXT:    vmov.f32 s17, s13
; CHECK-NEXT:    vldrw.u32 q3, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s18, s22
; CHECK-NEXT:    vmovx.f16 s0, s17
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vrev32.16 q0, q1
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vmov.16 q1[2], r2
; CHECK-NEXT:    vmov.f32 s10, s26
; CHECK-NEXT:    vmov.16 q1[3], r0
; CHECK-NEXT:    vldrw.u32 q6, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.16 q1[4], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q1[5], r0
; CHECK-NEXT:    vmovx.f16 s0, s13
; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s0, s14
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.16 q5[0], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.16 q5[1], r2
; CHECK-NEXT:    vmov.16 q5[6], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q5[7], r0
; CHECK-NEXT:    vmov.f32 s21, s5
; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s22, s14
; CHECK-NEXT:    vmovx.f16 s0, s21
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vrev32.16 q3, q0
; CHECK-NEXT:    vmov r2, s13
; CHECK-NEXT:    vmovx.f16 s12, s14
; CHECK-NEXT:    vmov.16 q0[2], r2
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov r0, s22
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vldrw.u32 q3, [sp, #128] @ 16-byte Reload
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov.f32 s13, s25
; CHECK-NEXT:    vmov.f32 s14, s26
; CHECK-NEXT:    vldrw.u32 q6, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s25, s5
; CHECK-NEXT:    vstrw.32 q3, [r1]
; CHECK-NEXT:    vmov.f32 s21, s1
; CHECK-NEXT:    vmov.f32 s26, s6
; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s22, s2
; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s29, s5
; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
; CHECK-NEXT:    vmov.f32 s17, s1
; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
; CHECK-NEXT:    vmov.f32 s30, s6
; CHECK-NEXT:    vmov.f32 s18, s2
; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
; CHECK-NEXT:    vstrw.32 q4, [r1, #64]
; CHECK-NEXT:    add sp, #144
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
  %l1 = load <16 x half>, <16 x half>* %s1, align 4
  %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
  %l2 = load <16 x half>, <16 x half>* %s2, align 4
  %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2
  %l3 = load <16 x half>, <16 x half>* %s3, align 4
  %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x half> %l3, <16 x half> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <32 x half> %t1, <32 x half> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x half> %s, <48 x half> *%dst
  ret void
}

; f64

define void @vst3_v2f64(<2 x double> *%src, <6 x double> *%dst) {
; CHECK-LABEL: vst3_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov.f64 d6, d2
; CHECK-NEXT:    vmov.f64 d7, d1
; CHECK-NEXT:    vmov.f64 d1, d4
; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
; CHECK-NEXT:    vmov.f64 d2, d5
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
  %l1 = load <2 x double>, <2 x double>* %s1, align 4
  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
  %l2 = load <2 x double>, <2 x double>* %s2, align 4
  %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2
  %l3 = load <2 x double>, <2 x double>* %s3, align 4
  %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x double> %l3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x double> %t1, <4 x double> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x double> %s, <6 x double> *%dst
  ret void
}

define void @vst3_v4f64(<4 x double> *%src, <12 x double> *%dst) {
; CHECK-LABEL: vst3_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov.f64 d6, d15
; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
; CHECK-NEXT:    vmov.f64 d10, d2
; CHECK-NEXT:    vmov.f64 d7, d1
; CHECK-NEXT:    vmov.f64 d11, d12
; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
; CHECK-NEXT:    vmov.f64 d12, d4
; CHECK-NEXT:    vstrw.32 q5, [r1]
; CHECK-NEXT:    vmov.f64 d1, d5
; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov.f64 d2, d8
; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
; CHECK-NEXT:    vmov.f64 d13, d14
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vmov.f64 d8, d5
; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
  %l1 = load <4 x double>, <4 x double>* %s1, align 4
  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
  %l2 = load <4 x double>, <4 x double>* %s2, align 4
  %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2
  %l3 = load <4 x double>, <4 x double>* %s3, align 4
  %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x double> %l3, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x double> %t1, <8 x double> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x double> %s, <12 x double> *%dst
  ret void
}