; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s

; Each function in the i32 and i16 sections loads vectors from consecutive
; element offsets of %src, concatenates them pairwise with identity
; shufflevectors (%t1, %t2), interleaves the two halves with a stride-4 mask
; (%s) and stores the result to %dst. The CHECK lines pin the assembly llc
; emits for that pattern.
;
; NOTE(review): in each of these functions %l4 is loaded through %s3, not %s4,
; so %s4 is unused and the fourth input is a duplicate of the third (visible in
; the checks as e.g. `vmov q3, q2` after only three vector loads). This looks
; like a copy-paste slip -- confirm the intent. If %s4 was meant, change the IR
; and regenerate the assertions with utils/update_llc_test_checks.py instead of
; editing them by hand.

; i32

; 4-way interleave of <2 x i32> inputs, stored as one <8 x i32>.
define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) {
; CHECK-LABEL: vst4_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
; CHECK-NEXT:    vmov.32 q1[0], r4
; CHECK-NEXT:    vmov.32 q1[2], r0
; CHECK-NEXT:    vmov.f64 d0, d2
; CHECK-NEXT:    vmov.f32 s1, s6
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s6
; CHECK-NEXT:    vmov.32 q1[0], r2
; CHECK-NEXT:    vmov.32 q1[1], r3
; CHECK-NEXT:    vmov.32 q1[2], r12
; CHECK-NEXT:    vmov.32 q1[3], lr
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s9, s6
; CHECK-NEXT:    vmov.f32 s10, s0
; CHECK-NEXT:    vmov.f32 s11, s2
; CHECK-NEXT:    vstrw.32 q2, [r1]
; CHECK-NEXT:    vmov.f32 s8, s5
; CHECK-NEXT:    vmov.f32 s9, s7
; CHECK-NEXT:    vmov.f32 s10, s1
; CHECK-NEXT:    vmov.f32 s11, s3
; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
; CHECK-NEXT:    pop {r4, pc}
entry:
  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
  %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2
  %l3 = load <2 x i32>, <2 x i32>* %s3, align 4
  %s4 = getelementptr <2 x i32>, <2 x i32>* %src, i32 3
  %l4 = load <2 x i32>, <2 x i32>* %s3, align 4
  %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i32> %l3, <2 x i32> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  store <8 x i32> %s, <8 x i32> *%dst
  ret void
}

; 4-way interleave of <4 x i32> inputs; the checks expect one vst40-vst43.32 group.
define void @vst4_v4i32(<4 x i32> *%src, <16 x i32> *%dst) {
; CHECK-LABEL: vst4_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
  %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
  %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
  %s4 = getelementptr <4 x i32>, <4 x i32>* %src, i32 3
  %l4 = load <4 x i32>, <4 x i32>* %s3, align 4
  %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %s, <16 x i32> *%dst
  ret void
}

; 4-way interleave of <8 x i32> inputs; the checks expect two vst40-vst43.32 groups.
define void @vst4_v8i32(<8 x i32> *%src, <32 x i32> *%dst) {
; CHECK-LABEL: vst4_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov q7, q6
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vst40.32 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst41.32 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst42.32 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst43.32 {q4, q5, q6, q7}, [r1]!
; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
  %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2
  %l3 = load <8 x i32>, <8 x i32>* %s3, align 4
  %s4 = getelementptr <8 x i32>, <8 x i32>* %src, i32 3
  %l4 = load <8 x i32>, <8 x i32>* %s3, align 4
  %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i32> %l3, <8 x i32> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i32> %s, <32 x i32> *%dst
  ret void
}

; 4-way interleave of <16 x i32> inputs; the recorded output uses a 216-byte
; stack frame with 64-byte spills/reloads around four vst40-vst43.32 groups.
define void @vst4_v16i32(<16 x i32> *%src, <64 x i32> *%dst) {
; CHECK-LABEL: vst4_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5}
; CHECK-NEXT:    push {r4, r5}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #216
; CHECK-NEXT:    sub sp, #216
; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
; CHECK-NEXT:    add r2, sp, #64
; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
; CHECK-NEXT:    add r2, sp, #128
; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
; CHECK-NEXT:    add r2, sp, #64
; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
; CHECK-NEXT:    add r2, sp, #64
; CHECK-NEXT:    vmov q7, q6
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
; CHECK-NEXT:    add r2, sp, #128
; CHECK-NEXT:    vldrw.u32 q6, [r0, #160]
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
; CHECK-NEXT:    add r2, sp, #128
; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
; CHECK-NEXT:    add r2, sp, #128
; CHECK-NEXT:    vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
; CHECK-NEXT:    vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
; CHECK-NEXT:    add r2, sp, #128
; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
; CHECK-NEXT:    add r2, sp, #64
; CHECK-NEXT:    vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
; CHECK-NEXT:    add r2, sp, #128
; CHECK-NEXT:    vmov q7, q6
; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
; CHECK-NEXT:    add.w r0, r1, #192
; CHECK-NEXT:    adds r1, #128
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vst40.32 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst41.32 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst42.32 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst43.32 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    add sp, #216
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop {r4, r5}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
  %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2
  %l3 = load <16 x i32>, <16 x i32>* %s3, align 4
  %s4 = getelementptr <16 x i32>, <16 x i32>* %src, i32 3
  %l4 = load <16 x i32>, <16 x i32>* %s3, align 4
  %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x i32> %l3, <16 x i32> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
  store <64 x i32> %s, <64 x i32> *%dst
  ret void
}

; i16

; 4-way interleave of <2 x i16> inputs; the checks expect scalar ldrh loads
; and per-lane vmov.16 inserts rather than vst4x instructions.
define void @vst4_v2i16(<2 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vst4_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldrh r4, [r0]
; CHECK-NEXT:    ldrh.w lr, [r0, #4]
; CHECK-NEXT:    ldrh r3, [r0, #8]
; CHECK-NEXT:    vmov.32 q0[0], r4
; CHECK-NEXT:    ldrh.w r12, [r0, #6]
; CHECK-NEXT:    ldrh r2, [r0, #10]
; CHECK-NEXT:    ldrh r0, [r0, #2]
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    vmov.16 q0[1], lr
; CHECK-NEXT:    vmov.16 q0[2], r3
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r12
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov.16 q0[7], r2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    pop {r4, pc}
entry:
  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
  %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2
  %l3 = load <2 x i16>, <2 x i16>* %s3, align 4
  %s4 = getelementptr <2 x i16>, <2 x i16>* %src, i32 3
  %l4 = load <2 x i16>, <2 x i16>* %s3, align 4
  %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i16> %l3, <2 x i16> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  store <8 x i16> %s, <8 x i16> *%dst
  ret void
}

; 4-way interleave of <4 x i16> inputs; the checks expect widening vldrh.u32
; loads followed by per-lane vmov.16 inserts.
define void @vst4_v4i16(<4 x i16> *%src, <16 x i16> *%dst) {
; CHECK-LABEL: vst4_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrh.u32 q1, [r0]
; CHECK-NEXT:    vldrh.u32 q2, [r0, #8]
; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    vmov.16 q0[1], r2
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov.16 q0[7], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q4[0], r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q4[1], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.16 q4[2], r0
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.16 q4[3], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.16 q4[4], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.16 q4[5], r0
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.16 q4[6], r0
; CHECK-NEXT:    vmov.16 q4[7], r0
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
  %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2
  %l3 = load <4 x i16>, <4 x i16>* %s3, align 4
  %s4 = getelementptr <4 x i16>, <4 x i16>* %src, i32 3
  %l4 = load <4 x i16>, <4 x i16>* %s3, align 4
  %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i16> %l3, <4 x i16> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i16> %s, <16 x i16> *%dst
  ret void
}

; 4-way interleave of <8 x i16> inputs; the checks expect one vst40-vst43.16 group.
define void @vst4_v8i16(<8 x i16> *%src, <32 x i16> *%dst) {
; CHECK-LABEL: vst4_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
  %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
  %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
  %s4 = getelementptr <8 x i16>, <8 x i16>* %src, i32 3
  %l4 = load <8 x i16>, <8 x i16>* %s3, align 4
  %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i16> %s, <32 x i16> *%dst
  ret void
}

; 4-way interleave of <16 x i16> inputs; the checks expect two vst40-vst43.16 groups.
define void @vst4_v16i16(<16 x i16> *%src, <64 x i16> *%dst) {
; CHECK-LABEL: vst4_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov q7, q6
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vst40.16 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst41.16 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst42.16 {q4, q5, q6, q7}, [r1]
; CHECK-NEXT:    vst43.16 {q4, q5, q6, q7}, [r1]!
; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
  %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2
  %l3 = load <16 x i16>, <16 x i16>* %s3, align 4
  %s4 = getelementptr <16 x i16>, <16 x i16>* %src, i32 3
  %l4 = load <16 x i16>, <16 x i16>* %s3, align 4
  %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x i16> %l3, <16 x i16> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
  store <64 x i16> %s, <64 x i16> *%dst
  ret void
}

; i8

; The i8, i64 and f32 functions that follow load vectors from consecutive
; element offsets of %src, concatenate them pairwise with identity
; shufflevectors (%t1, %t2), interleave the two halves with a stride-4 mask
; (%s) and store the result to %dst.
;
; NOTE(review): in every function from vst4_v2i8 through vst4_v2f32, %l4 is
; loaded through %s3, not %s4, so %s4 is unused and the fourth input is a
; duplicate of the third (the checks show only three distinct loads). This
; looks like a copy-paste slip -- confirm the intent. If %s4 was meant, change
; the IR and regenerate the assertions with utils/update_llc_test_checks.py
; instead of editing them by hand.

; 4-way interleave of <2 x i8> inputs; the checks expect scalar ldrb loads,
; per-lane vmov.16 inserts and a narrowing vstrb.16 store.
define void @vst4_v2i8(<2 x i8> *%src, <8 x i8> *%dst) {
; CHECK-LABEL: vst4_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldrb r2, [r0]
; CHECK-NEXT:    ldrb r3, [r0, #1]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrb.w r12, [r0, #2]
; CHECK-NEXT:    vmov.32 q0[2], r3
; CHECK-NEXT:    ldrb.w lr, [r0, #3]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrb r4, [r0, #5]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    ldrb r0, [r0, #4]
; CHECK-NEXT:    vmov.16 q0[1], r12
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.16 q0[4], r3
; CHECK-NEXT:    vmov.16 q0[5], lr
; CHECK-NEXT:    vmov.16 q0[6], r4
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    vstrb.16 q0, [r1]
; CHECK-NEXT:    pop {r4, pc}
entry:
  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
  %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2
  %l3 = load <2 x i8>, <2 x i8>* %s3, align 4
  %s4 = getelementptr <2 x i8>, <2 x i8>* %src, i32 3
  %l4 = load <2 x i8>, <2 x i8>* %s3, align 4
  %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i8> %l3, <2 x i8> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  store <8 x i8> %s, <8 x i8> *%dst
  ret void
}

; 4-way interleave of <4 x i8> inputs; the checks expect widening vldrb.u32
; loads followed by per-lane vmov.8 inserts.
define void @vst4_v4i8(<4 x i8> *%src, <16 x i8> *%dst) {
; CHECK-LABEL: vst4_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
; CHECK-NEXT:    vldrb.u32 q3, [r0, #8]
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov.8 q0[0], r2
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov.8 q0[1], r2
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.8 q0[2], r0
; CHECK-NEXT:    vmov.8 q0[3], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.8 q0[5], r0
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov.8 q0[7], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.8 q0[9], r0
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.8 q0[10], r0
; CHECK-NEXT:    vmov.8 q0[11], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.8 q0[12], r0
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.8 q0[13], r0
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.8 q0[14], r0
; CHECK-NEXT:    vmov.8 q0[15], r0
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
  %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2
  %l3 = load <4 x i8>, <4 x i8>* %s3, align 4
  %s4 = getelementptr <4 x i8>, <4 x i8>* %src, i32 3
  %l4 = load <4 x i8>, <4 x i8>* %s3, align 4
  %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i8> %l3, <4 x i8> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i8> %s, <16 x i8> *%dst
  ret void
}

; 4-way interleave of <8 x i8> inputs; the checks expect widening vldrb.u16
; loads followed by per-lane vmov.u16 extracts and vmov.8 inserts.
define void @vst4_v8i8(<8 x i8> *%src, <32 x i8> *%dst) {
; CHECK-LABEL: vst4_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.u16 q1, [r0]
; CHECK-NEXT:    vldrb.u16 q2, [r0, #8]
; CHECK-NEXT:    vldrb.u16 q3, [r0, #16]
; CHECK-NEXT:    vmov.u16 r2, q1[4]
; CHECK-NEXT:    vmov.8 q0[0], r2
; CHECK-NEXT:    vmov.u16 r2, q2[4]
; CHECK-NEXT:    vmov.8 q0[1], r2
; CHECK-NEXT:    vmov.u16 r0, q3[4]
; CHECK-NEXT:    vmov.8 q0[2], r0
; CHECK-NEXT:    vmov.8 q0[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[5]
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov.u16 r0, q2[5]
; CHECK-NEXT:    vmov.8 q0[5], r0
; CHECK-NEXT:    vmov.u16 r0, q3[5]
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov.8 q0[7], r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov.u16 r0, q2[6]
; CHECK-NEXT:    vmov.8 q0[9], r0
; CHECK-NEXT:    vmov.u16 r0, q3[6]
; CHECK-NEXT:    vmov.8 q0[10], r0
; CHECK-NEXT:    vmov.8 q0[11], r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.8 q0[12], r0
; CHECK-NEXT:    vmov.u16 r0, q2[7]
; CHECK-NEXT:    vmov.8 q0[13], r0
; CHECK-NEXT:    vmov.u16 r0, q3[7]
; CHECK-NEXT:    vmov.8 q0[14], r0
; CHECK-NEXT:    vmov.8 q0[15], r0
; CHECK-NEXT:    vmov.u16 r0, q1[0]
; CHECK-NEXT:    vmov.8 q4[0], r0
; CHECK-NEXT:    vmov.u16 r0, q2[0]
; CHECK-NEXT:    vmov.8 q4[1], r0
; CHECK-NEXT:    vmov.u16 r0, q3[0]
; CHECK-NEXT:    vmov.8 q4[2], r0
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.8 q4[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[1]
; CHECK-NEXT:    vmov.8 q4[4], r0
; CHECK-NEXT:    vmov.u16 r0, q2[1]
; CHECK-NEXT:    vmov.8 q4[5], r0
; CHECK-NEXT:    vmov.u16 r0, q3[1]
; CHECK-NEXT:    vmov.8 q4[6], r0
; CHECK-NEXT:    vmov.8 q4[7], r0
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.8 q4[8], r0
; CHECK-NEXT:    vmov.u16 r0, q2[2]
; CHECK-NEXT:    vmov.8 q4[9], r0
; CHECK-NEXT:    vmov.u16 r0, q3[2]
; CHECK-NEXT:    vmov.8 q4[10], r0
; CHECK-NEXT:    vmov.8 q4[11], r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.8 q4[12], r0
; CHECK-NEXT:    vmov.u16 r0, q2[3]
; CHECK-NEXT:    vmov.8 q4[13], r0
; CHECK-NEXT:    vmov.u16 r0, q3[3]
; CHECK-NEXT:    vmov.8 q4[14], r0
; CHECK-NEXT:    vmov.8 q4[15], r0
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
  %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2
  %l3 = load <8 x i8>, <8 x i8>* %s3, align 4
  %s4 = getelementptr <8 x i8>, <8 x i8>* %src, i32 3
  %l4 = load <8 x i8>, <8 x i8>* %s3, align 4
  %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i8> %l3, <8 x i8> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i8> %s, <32 x i8> *%dst
  ret void
}

; 4-way interleave of <16 x i8> inputs; the checks expect one vst40-vst43.8 group.
define void @vst4_v16i8(<16 x i8> *%src, <64 x i8> *%dst) {
; CHECK-LABEL: vst4_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vst40.8 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst41.8 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst42.8 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst43.8 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
  %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
  %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
  %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
  %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2
  %l3 = load <16 x i8>, <16 x i8>* %s3, align 4
  %s4 = getelementptr <16 x i8>, <16 x i8>* %src, i32 3
  %l4 = load <16 x i8>, <16 x i8>* %s3, align 4
  %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x i8> %l3, <16 x i8> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
  store <64 x i8> %s, <64 x i8> *%dst
  ret void
}

; i64

; 4-way interleave of <2 x i64> inputs; the checks expect plain vldrw/vstrw
; with vmov.f32/vmov.f64 lane shuffles and no vst4x instructions.
define void @vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
; CHECK-LABEL: vst4_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q3, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vmov.f64 d4, d6
; CHECK-NEXT:    vmov.f32 s9, s13
; CHECK-NEXT:    vmov.f32 s10, s0
; CHECK-NEXT:    vmov.f32 s11, s1
; CHECK-NEXT:    vmov.f32 s0, s14
; CHECK-NEXT:    vstrw.32 q2, [r1]
; CHECK-NEXT:    vmov.f32 s1, s15
; CHECK-NEXT:    vmov.f64 d6, d2
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vmov.f32 s13, s5
; CHECK-NEXT:    vmov.f32 s14, s4
; CHECK-NEXT:    vmov.f32 s15, s5
; CHECK-NEXT:    vmov.f32 s4, s6
; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
; CHECK-NEXT:    vmov.f32 s5, s7
; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
  %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
  %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
  %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
  %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2
  %l3 = load <2 x i64>, <2 x i64>* %s3, align 4
  %s4 = getelementptr <2 x i64>, <2 x i64>* %src, i32 3
  %l4 = load <2 x i64>, <2 x i64>* %s3, align 4
  %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i64> %l3, <2 x i64> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  store <8 x i64> %s, <8 x i64> *%dst
  ret void
}

; 4-way interleave of <4 x i64> inputs; as with vst4_v2i64 the checks expect
; vldrw/vstrw plus vmov lane shuffles.
define void @vst4_v4i64(<4 x i64> *%src, <16 x i64> *%dst) {
; CHECK-LABEL: vst4_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q5, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
; CHECK-NEXT:    vmov.f64 d6, d10
; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
; CHECK-NEXT:    vmov.f32 s13, s21
; CHECK-NEXT:    vmov.f32 s14, s0
; CHECK-NEXT:    vmov.f32 s15, s1
; CHECK-NEXT:    vmov.f32 s0, s22
; CHECK-NEXT:    vstrw.32 q3, [r1]
; CHECK-NEXT:    vmov.f32 s1, s23
; CHECK-NEXT:    vmov.f64 d10, d12
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vmov.f32 s21, s25
; CHECK-NEXT:    vmov.f32 s22, s8
; CHECK-NEXT:    vmov.f32 s23, s9
; CHECK-NEXT:    vmov.f32 s8, s26
; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
; CHECK-NEXT:    vmov.f32 s9, s27
; CHECK-NEXT:    vmov.f64 d12, d2
; CHECK-NEXT:    vstrw.32 q2, [r1, #96]
; CHECK-NEXT:    vmov.f64 d14, d8
; CHECK-NEXT:    vmov.f32 s25, s5
; CHECK-NEXT:    vmov.f32 s29, s17
; CHECK-NEXT:    vmov.f32 s26, s4
; CHECK-NEXT:    vmov.f32 s30, s16
; CHECK-NEXT:    vmov.f32 s27, s5
; CHECK-NEXT:    vmov.f32 s4, s6
; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
; CHECK-NEXT:    vmov.f32 s31, s17
; CHECK-NEXT:    vmov.f32 s16, s18
; CHECK-NEXT:    vstrw.32 q7, [r1, #80]
; CHECK-NEXT:    vmov.f32 s5, s7
; CHECK-NEXT:    vmov.f32 s17, s19
; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
  %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
  %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
  %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
  %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2
  %l3 = load <4 x i64>, <4 x i64>* %s3, align 4
  %s4 = getelementptr <4 x i64>, <4 x i64>* %src, i32 3
  %l4 = load <4 x i64>, <4 x i64>* %s3, align 4
  %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i64> %l3, <4 x i64> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i64> %s, <16 x i64> *%dst
  ret void
}

; f32

; 4-way interleave of <2 x float> inputs; the checks expect scalar vldr loads
; into s-registers and two vstrw.32 stores.
define void @vst4_v2f32(<2 x float> *%src, <8 x float> *%dst) {
; CHECK-LABEL: vst4_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr s0, [r0]
; CHECK-NEXT:    vldr s4, [r0, #4]
; CHECK-NEXT:    vldr s1, [r0, #8]
; CHECK-NEXT:    vldr s5, [r0, #12]
; CHECK-NEXT:    vldr s2, [r0, #16]
; CHECK-NEXT:    vldr s6, [r0, #20]
; CHECK-NEXT:    vmov.f32 s3, s2
; CHECK-NEXT:    vmov.f32 s7, s6
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
  %l1 = load <2 x float>, <2 x float>* %s1, align 4
  %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
  %l2 = load <2 x float>, <2 x float>* %s2, align 4
  %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2
  %l3 = load <2 x float>, <2 x float>* %s3, align 4
  %s4 = getelementptr <2 x float>, <2 x float>* %src, i32 3
  %l4 = load <2 x float>, <2 x float>* %s3, align 4
  %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x float> %l3, <2 x float> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s = shufflevector <4 x float> %t1, <4 x float> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  store <8 x float> %s, <8 x float> *%dst
  ret void
}

define void @vst4_v4f32(<4 x float> *%src, <16 x float> *%dst) {
; CHECK-LABEL: vst4_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
  %l1 = load <4 x float>, <4 x float>* %s1, align 4
  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
  %l2 = load <4 x float>, <4 x float>* %s2, align 4
  %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
  %l3 = load <4 x float>, <4 x float>*
%s3, align 4 729 %s4 = getelementptr <4 x float>, <4 x float>* %src, i32 3 730 %l4 = load <4 x float>, <4 x float>* %s3, align 4 731 %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 732 %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 733 %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> 734 store <16 x float> %s, <16 x float> *%dst 735 ret void 736} 737 738define void @vst4_v8f32(<8 x float> *%src, <32 x float> *%dst) { 739; CHECK-LABEL: vst4_v8f32: 740; CHECK: @ %bb.0: @ %entry 741; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 742; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 743; CHECK-NEXT: vldrw.u32 q6, [r0, #64] 744; CHECK-NEXT: vldrw.u32 q2, [r0, #80] 745; CHECK-NEXT: vldrw.u32 q5, [r0, #32] 746; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 747; CHECK-NEXT: vldrw.u32 q4, [r0] 748; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 749; CHECK-NEXT: vmov q7, q6 750; CHECK-NEXT: vmov q3, q2 751; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] 752; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] 753; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] 754; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]! 
755; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] 756; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] 757; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] 758; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1] 759; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 760; CHECK-NEXT: bx lr 761entry: 762 %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0 763 %l1 = load <8 x float>, <8 x float>* %s1, align 4 764 %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1 765 %l2 = load <8 x float>, <8 x float>* %s2, align 4 766 %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2 767 %l3 = load <8 x float>, <8 x float>* %s3, align 4 768 %s4 = getelementptr <8 x float>, <8 x float>* %src, i32 3 769 %l4 = load <8 x float>, <8 x float>* %s3, align 4 770 %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 771 %t2 = shufflevector <8 x float> %l3, <8 x float> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 772 %s = shufflevector <16 x float> %t1, <16 x float> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31> 773 store <32 x float> %s, <32 x float> *%dst 774 ret void 775} 776 777define void @vst4_v16f32(<16 x float> *%src, <64 x float> *%dst) { 778; CHECK-LABEL: vst4_v16f32: 779; CHECK: @ %bb.0: @ %entry 780; CHECK-NEXT: .save {r4, r5} 781; CHECK-NEXT: push {r4, r5} 782; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 783; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 784; CHECK-NEXT: .pad #216 785; CHECK-NEXT: sub sp, #216 786; CHECK-NEXT: vldrw.u32 q2, [r0, #144] 787; CHECK-NEXT: add r2, sp, #64 788; CHECK-NEXT: vldrw.u32 q1, 
[r0, #80] 789; CHECK-NEXT: vldrw.u32 q6, [r0, #176] 790; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 791; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill 792; CHECK-NEXT: add r2, sp, #128 793; CHECK-NEXT: vldrw.u32 q2, [r0, #128] 794; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill 795; CHECK-NEXT: add r2, sp, #64 796; CHECK-NEXT: vldrw.u32 q1, [r0, #64] 797; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload 798; CHECK-NEXT: add r2, sp, #64 799; CHECK-NEXT: vmov q7, q6 800; CHECK-NEXT: vldrw.u32 q0, [r0] 801; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill 802; CHECK-NEXT: add r2, sp, #128 803; CHECK-NEXT: vldrw.u32 q6, [r0, #160] 804; CHECK-NEXT: vmov q3, q2 805; CHECK-NEXT: vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill 806; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload 807; CHECK-NEXT: add r2, sp, #128 808; CHECK-NEXT: vldrw.u32 q5, [r0, #112] 809; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill 810; CHECK-NEXT: add r2, sp, #128 811; CHECK-NEXT: vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload 812; CHECK-NEXT: vldrw.u32 q5, [r0, #96] 813; CHECK-NEXT: vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill 814; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload 815; CHECK-NEXT: add r2, sp, #128 816; CHECK-NEXT: vldrw.u32 q4, [r0, #48] 817; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill 818; CHECK-NEXT: add r2, sp, #64 819; CHECK-NEXT: vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload 820; CHECK-NEXT: vldrw.u32 q4, [r0, #32] 821; CHECK-NEXT: mov r0, r1 822; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] 823; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] 824; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] 825; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]! 
826; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload 827; CHECK-NEXT: add r2, sp, #128 828; CHECK-NEXT: vmov q7, q6 829; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] 830; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] 831; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] 832; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] 833; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload 834; CHECK-NEXT: add.w r0, r1, #192 835; CHECK-NEXT: adds r1, #128 836; CHECK-NEXT: vmov q3, q2 837; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] 838; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] 839; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] 840; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] 841; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] 842; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] 843; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] 844; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] 845; CHECK-NEXT: add sp, #216 846; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 847; CHECK-NEXT: pop {r4, r5} 848; CHECK-NEXT: bx lr 849entry: 850 %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0 851 %l1 = load <16 x float>, <16 x float>* %s1, align 4 852 %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1 853 %l2 = load <16 x float>, <16 x float>* %s2, align 4 854 %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2 855 %l3 = load <16 x float>, <16 x float>* %s3, align 4 856 %s4 = getelementptr <16 x float>, <16 x float>* %src, i32 3 857 %l4 = load <16 x float>, <16 x float>* %s3, align 4 858 %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 859 %t2 = shufflevector <16 x float> %l3, <16 x float> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, 
i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 860 %s = shufflevector <32 x float> %t1, <32 x float> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63> 861 store <64 x float> %s, <64 x float> *%dst 862 ret void 863} 864 865; f16 866 867define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) { 868; CHECK-LABEL: vst4_v2f16: 869; CHECK: @ %bb.0: @ %entry 870; CHECK-NEXT: vldmia r0, {s4, s5} 871; CHECK-NEXT: vmov r2, s5 872; CHECK-NEXT: ldr r0, [r0, #8] 873; CHECK-NEXT: vmov r3, s4 874; CHECK-NEXT: vmovx.f16 s12, s4 875; CHECK-NEXT: vmov.16 q0[0], r3 876; CHECK-NEXT: vdup.32 q2, r0 877; CHECK-NEXT: vmov.16 q0[1], r2 878; CHECK-NEXT: vmov r0, s8 879; CHECK-NEXT: vmov.16 q0[2], r0 880; CHECK-NEXT: vmov r0, s9 881; CHECK-NEXT: vmov.16 q0[3], r0 882; CHECK-NEXT: vmov r0, s12 883; CHECK-NEXT: vmovx.f16 s4, s5 884; CHECK-NEXT: vmov.16 q0[4], r0 885; CHECK-NEXT: vmov r0, s4 886; CHECK-NEXT: vmovx.f16 s4, s8 887; CHECK-NEXT: vmov.16 q0[5], r0 888; CHECK-NEXT: vmov r0, s4 889; CHECK-NEXT: vmovx.f16 s4, s9 890; CHECK-NEXT: vmov.16 q0[6], r0 891; CHECK-NEXT: vmov r0, s4 892; CHECK-NEXT: vmov.16 q0[7], r0 893; CHECK-NEXT: vstrw.32 q0, [r1] 894; CHECK-NEXT: bx lr 895entry: 896 %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 897 %l1 = load <2 x half>, <2 x half>* %s1, align 4 898 %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1 899 %l2 = load <2 x half>, <2 x half>* %s2, align 4 
900 %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2 901 %l3 = load <2 x half>, <2 x half>* %s3, align 4 902 %s4 = getelementptr <2 x half>, <2 x half>* %src, i32 3 903 %l4 = load <2 x half>, <2 x half>* %s3, align 4 904 %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 905 %t2 = shufflevector <2 x half> %l3, <2 x half> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 906 %s = shufflevector <4 x half> %t1, <4 x half> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> 907 store <8 x half> %s, <8 x half> *%dst 908 ret void 909} 910 911define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) { 912; CHECK-LABEL: vst4_v4f16: 913; CHECK: @ %bb.0: @ %entry 914; CHECK-NEXT: .save {r7, lr} 915; CHECK-NEXT: push {r7, lr} 916; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} 917; CHECK-NEXT: vmov.32 q0[0], r2 918; CHECK-NEXT: vmov.32 q0[1], r3 919; CHECK-NEXT: vmov.32 q0[2], r12 920; CHECK-NEXT: vmov.32 q0[3], lr 921; CHECK-NEXT: vmov r3, s1 922; CHECK-NEXT: vmovx.f16 s12, s1 923; CHECK-NEXT: vmov r2, s3 924; CHECK-NEXT: vmov.16 q2[0], r3 925; CHECK-NEXT: vmov.16 q2[1], r2 926; CHECK-NEXT: ldrd r2, r0, [r0, #16] 927; CHECK-NEXT: vmov.32 q1[0], r2 928; CHECK-NEXT: vmov.32 q1[1], r0 929; CHECK-NEXT: vmov.32 q1[2], r2 930; CHECK-NEXT: vmov r2, s0 931; CHECK-NEXT: vmov.32 q1[3], r0 932; CHECK-NEXT: vmov r0, s5 933; CHECK-NEXT: vmov.16 q2[2], r0 934; CHECK-NEXT: vmov r0, s7 935; CHECK-NEXT: vmov.16 q2[3], r0 936; CHECK-NEXT: vmov r0, s12 937; CHECK-NEXT: vmovx.f16 s12, s3 938; CHECK-NEXT: vmov.16 q2[4], r0 939; CHECK-NEXT: vmov r0, s12 940; CHECK-NEXT: vmovx.f16 s12, s5 941; CHECK-NEXT: vmov.16 q2[5], r0 942; CHECK-NEXT: vmov r0, s12 943; CHECK-NEXT: vmovx.f16 s12, s7 944; CHECK-NEXT: vmov.16 q2[6], r0 945; CHECK-NEXT: vmov r0, s12 946; CHECK-NEXT: vmovx.f16 s12, s0 947; CHECK-NEXT: vmov.16 q2[7], r0 948; CHECK-NEXT: vmov r0, s2 949; CHECK-NEXT: vstrw.32 q2, [r1, #16] 950; CHECK-NEXT: vmov.16 q2[0], r2 951; CHECK-NEXT: 
vmov.16 q2[1], r0 952; CHECK-NEXT: vmov r0, s4 953; CHECK-NEXT: vmov.16 q2[2], r0 954; CHECK-NEXT: vmov r0, s6 955; CHECK-NEXT: vmov.16 q2[3], r0 956; CHECK-NEXT: vmov r0, s12 957; CHECK-NEXT: vmovx.f16 s0, s2 958; CHECK-NEXT: vmov.16 q2[4], r0 959; CHECK-NEXT: vmov r0, s0 960; CHECK-NEXT: vmovx.f16 s0, s4 961; CHECK-NEXT: vmov.16 q2[5], r0 962; CHECK-NEXT: vmov r0, s0 963; CHECK-NEXT: vmovx.f16 s0, s6 964; CHECK-NEXT: vmov.16 q2[6], r0 965; CHECK-NEXT: vmov r0, s0 966; CHECK-NEXT: vmov.16 q2[7], r0 967; CHECK-NEXT: vstrw.32 q2, [r1] 968; CHECK-NEXT: pop {r7, pc} 969entry: 970 %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 971 %l1 = load <4 x half>, <4 x half>* %s1, align 4 972 %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1 973 %l2 = load <4 x half>, <4 x half>* %s2, align 4 974 %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2 975 %l3 = load <4 x half>, <4 x half>* %s3, align 4 976 %s4 = getelementptr <4 x half>, <4 x half>* %src, i32 3 977 %l4 = load <4 x half>, <4 x half>* %s3, align 4 978 %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 979 %t2 = shufflevector <4 x half> %l3, <4 x half> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 980 %s = shufflevector <8 x half> %t1, <8 x half> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> 981 store <16 x half> %s, <16 x half> *%dst 982 ret void 983} 984 985define void @vst4_v8f16(<8 x half> *%src, <32 x half> *%dst) { 986; CHECK-LABEL: vst4_v8f16: 987; CHECK: @ %bb.0: @ %entry 988; CHECK-NEXT: vldrw.u32 q2, [r0, #32] 989; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 990; CHECK-NEXT: vldrw.u32 q0, [r0] 991; CHECK-NEXT: vmov q3, q2 992; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1] 993; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1] 994; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1] 995; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1] 996; 
CHECK-NEXT: bx lr 997entry: 998 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 999 %l1 = load <8 x half>, <8 x half>* %s1, align 4 1000 %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1 1001 %l2 = load <8 x half>, <8 x half>* %s2, align 4 1002 %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2 1003 %l3 = load <8 x half>, <8 x half>* %s3, align 4 1004 %s4 = getelementptr <8 x half>, <8 x half>* %src, i32 3 1005 %l4 = load <8 x half>, <8 x half>* %s3, align 4 1006 %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1007 %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1008 %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31> 1009 store <32 x half> %s, <32 x half> *%dst 1010 ret void 1011} 1012 1013define void @vst4_v16f16(<16 x half> *%src, <64 x half> *%dst) { 1014; CHECK-LABEL: vst4_v16f16: 1015; CHECK: @ %bb.0: @ %entry 1016; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1017; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1018; CHECK-NEXT: vldrw.u32 q6, [r0, #64] 1019; CHECK-NEXT: vldrw.u32 q2, [r0, #80] 1020; CHECK-NEXT: vldrw.u32 q5, [r0, #32] 1021; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 1022; CHECK-NEXT: vldrw.u32 q4, [r0] 1023; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 1024; CHECK-NEXT: vmov q7, q6 1025; CHECK-NEXT: vmov q3, q2 1026; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] 1027; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] 1028; CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1] 1029; CHECK-NEXT: vst43.16 {q4, q5, q6, 
q7}, [r1]! 1030; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1] 1031; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1] 1032; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1] 1033; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1] 1034; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1035; CHECK-NEXT: bx lr 1036entry: 1037 %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0 1038 %l1 = load <16 x half>, <16 x half>* %s1, align 4 1039 %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1 1040 %l2 = load <16 x half>, <16 x half>* %s2, align 4 1041 %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2 1042 %l3 = load <16 x half>, <16 x half>* %s3, align 4 1043 %s4 = getelementptr <16 x half>, <16 x half>* %src, i32 3 1044 %l4 = load <16 x half>, <16 x half>* %s3, align 4 1045 %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1046 %t2 = shufflevector <16 x half> %l3, <16 x half> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1047 %s = shufflevector <32 x half> %t1, <32 x half> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63> 1048 store 
<64 x half> %s, <64 x half> *%dst 1049 ret void 1050} 1051 1052; f64 1053 1054define void @vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) { 1055; CHECK-LABEL: vst4_v2f64: 1056; CHECK: @ %bb.0: @ %entry 1057; CHECK-NEXT: vldrw.u32 q1, [r0] 1058; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 1059; CHECK-NEXT: vldrw.u32 q2, [r0, #32] 1060; CHECK-NEXT: vmov.f64 d6, d2 1061; CHECK-NEXT: vmov.f64 d7, d0 1062; CHECK-NEXT: vmov.f64 d0, d3 1063; CHECK-NEXT: vstrw.32 q3, [r1] 1064; CHECK-NEXT: vmov.f64 d2, d4 1065; CHECK-NEXT: vstrw.32 q0, [r1, #32] 1066; CHECK-NEXT: vmov.f64 d3, d4 1067; CHECK-NEXT: vmov.f64 d4, d5 1068; CHECK-NEXT: vstrw.32 q1, [r1, #16] 1069; CHECK-NEXT: vstrw.32 q2, [r1, #48] 1070; CHECK-NEXT: bx lr 1071entry: 1072 %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0 1073 %l1 = load <2 x double>, <2 x double>* %s1, align 4 1074 %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1 1075 %l2 = load <2 x double>, <2 x double>* %s2, align 4 1076 %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2 1077 %l3 = load <2 x double>, <2 x double>* %s3, align 4 1078 %s4 = getelementptr <2 x double>, <2 x double>* %src, i32 3 1079 %l4 = load <2 x double>, <2 x double>* %s3, align 4 1080 %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1081 %t2 = shufflevector <2 x double> %l3, <2 x double> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1082 %s = shufflevector <4 x double> %t1, <4 x double> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> 1083 store <8 x double> %s, <8 x double> *%dst 1084 ret void 1085} 1086 1087define void @vst4_v4f64(<4 x double> *%src, <16 x double> *%dst) { 1088; CHECK-LABEL: vst4_v4f64: 1089; CHECK: @ %bb.0: @ %entry 1090; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1091; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1092; CHECK-NEXT: vldrw.u32 q5, [r0] 1093; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 1094; CHECK-NEXT: vldrw.u32 q6, [r0, #16] 1095; 
CHECK-NEXT: vldrw.u32 q4, [r0, #80] 1096; CHECK-NEXT: vmov.f64 d4, d10 1097; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 1098; CHECK-NEXT: vldrw.u32 q1, [r0, #64] 1099; CHECK-NEXT: vmov.f64 d5, d0 1100; CHECK-NEXT: vmov.f64 d0, d11 1101; CHECK-NEXT: vstrw.32 q2, [r1] 1102; CHECK-NEXT: vmov.f64 d10, d12 1103; CHECK-NEXT: vstrw.32 q0, [r1, #32] 1104; CHECK-NEXT: vmov.f64 d11, d6 1105; CHECK-NEXT: vmov.f64 d6, d13 1106; CHECK-NEXT: vstrw.32 q5, [r1, #64] 1107; CHECK-NEXT: vmov.f64 d12, d2 1108; CHECK-NEXT: vstrw.32 q3, [r1, #96] 1109; CHECK-NEXT: vmov.f64 d14, d8 1110; CHECK-NEXT: vmov.f64 d13, d2 1111; CHECK-NEXT: vmov.f64 d15, d8 1112; CHECK-NEXT: vstrw.32 q6, [r1, #16] 1113; CHECK-NEXT: vmov.f64 d2, d3 1114; CHECK-NEXT: vstrw.32 q7, [r1, #80] 1115; CHECK-NEXT: vmov.f64 d8, d9 1116; CHECK-NEXT: vstrw.32 q1, [r1, #48] 1117; CHECK-NEXT: vstrw.32 q4, [r1, #112] 1118; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1119; CHECK-NEXT: bx lr 1120entry: 1121 %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0 1122 %l1 = load <4 x double>, <4 x double>* %s1, align 4 1123 %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1 1124 %l2 = load <4 x double>, <4 x double>* %s2, align 4 1125 %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2 1126 %l3 = load <4 x double>, <4 x double>* %s3, align 4 1127 %s4 = getelementptr <4 x double>, <4 x double>* %src, i32 3 1128 %l4 = load <4 x double>, <4 x double>* %s3, align 4 1129 %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1130 %t2 = shufflevector <4 x double> %l3, <4 x double> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1131 %s = shufflevector <8 x double> %t1, <8 x double> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> 1132 store <16 x double> %s, <16 x double> *%dst 1133 ret void 1134} 1135