• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s
3
4; i32
5
; vst4_v2i32: loads <2 x i32> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <8 x i32> to %dst.
; The CHECK lines are the autogenerated expected llc output; here the store is
; expected to be done with lane moves and two plain vstrw.32 stores.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused and the 3rd and 4th
; interleaved streams are identical. %s4 was presumably intended -- TODO confirm.
; Changing it requires regenerating the CHECK lines with update_llc_test_checks.py.
6define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) {
7; CHECK-LABEL: vst4_v2i32:
8; CHECK:       @ %bb.0: @ %entry
9; CHECK-NEXT:    .save {r4, lr}
10; CHECK-NEXT:    push {r4, lr}
11; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
12; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
13; CHECK-NEXT:    vmov.32 q1[0], r4
14; CHECK-NEXT:    vmov.32 q1[2], r0
15; CHECK-NEXT:    vmov.f64 d0, d2
16; CHECK-NEXT:    vmov.f32 s1, s6
17; CHECK-NEXT:    vmov.f32 s2, s4
18; CHECK-NEXT:    vmov.f32 s3, s6
19; CHECK-NEXT:    vmov.32 q1[0], r2
20; CHECK-NEXT:    vmov.32 q1[1], r3
21; CHECK-NEXT:    vmov.32 q1[2], r12
22; CHECK-NEXT:    vmov.32 q1[3], lr
23; CHECK-NEXT:    vmov.f64 d4, d2
24; CHECK-NEXT:    vmov.f32 s9, s6
25; CHECK-NEXT:    vmov.f32 s10, s0
26; CHECK-NEXT:    vmov.f32 s11, s2
27; CHECK-NEXT:    vstrw.32 q2, [r1]
28; CHECK-NEXT:    vmov.f32 s8, s5
29; CHECK-NEXT:    vmov.f32 s9, s7
30; CHECK-NEXT:    vmov.f32 s10, s1
31; CHECK-NEXT:    vmov.f32 s11, s3
32; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
33; CHECK-NEXT:    pop {r4, pc}
34entry:
35  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
36  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
37  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
38  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
39  %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2
40  %l3 = load <2 x i32>, <2 x i32>* %s3, align 4
41  %s4 = getelementptr <2 x i32>, <2 x i32>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
42  %l4 = load <2 x i32>, <2 x i32>* %s3, align 4
43  %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
44  %t2 = shufflevector <2 x i32> %l3, <2 x i32> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
45  %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
46  store <8 x i32> %s, <8 x i32> *%dst
47  ret void
48}
49
; vst4_v4i32: loads <4 x i32> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <16 x i32> to %dst.
; The CHECK lines are the autogenerated expected llc output: one vst40-vst43.32
; group on q0-q3.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly loads only three vectors and copies q2 into q3. %s4 was presumably
; intended -- TODO confirm. Changing it requires regenerating the CHECK lines.
50define void @vst4_v4i32(<4 x i32> *%src, <16 x i32> *%dst) {
51; CHECK-LABEL: vst4_v4i32:
52; CHECK:       @ %bb.0: @ %entry
53; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
54; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
55; CHECK-NEXT:    vldrw.u32 q0, [r0]
56; CHECK-NEXT:    vmov q3, q2
57; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
58; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
59; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
60; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
61; CHECK-NEXT:    bx lr
62entry:
63  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
64  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
65  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
66  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
67  %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
68  %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
69  %s4 = getelementptr <4 x i32>, <4 x i32>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
70  %l4 = load <4 x i32>, <4 x i32>* %s3, align 4
71  %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
72  %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
73  %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
74  store <16 x i32> %s, <16 x i32> *%dst
75  ret void
76}
77
; vst4_v8i32: loads <8 x i32> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <32 x i32> to %dst.
; The CHECK lines are the autogenerated expected llc output: two vst40-vst43.32
; groups, the first ending with a writeback on r1.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly copies q6 into q7 and q2 into q3 instead of loading a fourth stream.
; %s4 was presumably intended -- TODO confirm. Changing it requires regenerating
; the CHECK lines.
78define void @vst4_v8i32(<8 x i32> *%src, <32 x i32> *%dst) {
79; CHECK-LABEL: vst4_v8i32:
80; CHECK:       @ %bb.0: @ %entry
81; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
82; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
83; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
84; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
85; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
86; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
87; CHECK-NEXT:    vldrw.u32 q4, [r0]
88; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
89; CHECK-NEXT:    vmov q7, q6
90; CHECK-NEXT:    vmov q3, q2
91; CHECK-NEXT:    vst40.32 {q4, q5, q6, q7}, [r1]
92; CHECK-NEXT:    vst41.32 {q4, q5, q6, q7}, [r1]
93; CHECK-NEXT:    vst42.32 {q4, q5, q6, q7}, [r1]
94; CHECK-NEXT:    vst43.32 {q4, q5, q6, q7}, [r1]!
95; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
96; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
97; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
98; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
99; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
100; CHECK-NEXT:    bx lr
101entry:
102  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
103  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
104  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
105  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
106  %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2
107  %l3 = load <8 x i32>, <8 x i32>* %s3, align 4
108  %s4 = getelementptr <8 x i32>, <8 x i32>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
109  %l4 = load <8 x i32>, <8 x i32>* %s3, align 4
110  %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
111  %t2 = shufflevector <8 x i32> %l3, <8 x i32> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
112  %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
113  store <32 x i32> %s, <32 x i32> *%dst
114  ret void
115}
116
; vst4_v16i32: loads <16 x i32> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <64 x i32> to %dst.
; The CHECK lines are the autogenerated expected llc output: four vst40-vst43.32
; groups, with 64-byte spills/reloads through a 216-byte stack area.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused and the 3rd and 4th
; interleaved streams are identical. %s4 was presumably intended -- TODO confirm.
; Changing it requires regenerating the CHECK lines with update_llc_test_checks.py.
117define void @vst4_v16i32(<16 x i32> *%src, <64 x i32> *%dst) {
118; CHECK-LABEL: vst4_v16i32:
119; CHECK:       @ %bb.0: @ %entry
120; CHECK-NEXT:    .save {r4, r5}
121; CHECK-NEXT:    push {r4, r5}
122; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
123; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
124; CHECK-NEXT:    .pad #216
125; CHECK-NEXT:    sub sp, #216
126; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
127; CHECK-NEXT:    add r2, sp, #64
128; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
129; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
130; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
131; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
132; CHECK-NEXT:    add r2, sp, #128
133; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
134; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
135; CHECK-NEXT:    add r2, sp, #64
136; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
137; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
138; CHECK-NEXT:    add r2, sp, #64
139; CHECK-NEXT:    vmov q7, q6
140; CHECK-NEXT:    vldrw.u32 q0, [r0]
141; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
142; CHECK-NEXT:    add r2, sp, #128
143; CHECK-NEXT:    vldrw.u32 q6, [r0, #160]
144; CHECK-NEXT:    vmov q3, q2
145; CHECK-NEXT:    vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
146; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
147; CHECK-NEXT:    add r2, sp, #128
148; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
149; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
150; CHECK-NEXT:    add r2, sp, #128
151; CHECK-NEXT:    vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
152; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
153; CHECK-NEXT:    vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
154; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
155; CHECK-NEXT:    add r2, sp, #128
156; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
157; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
158; CHECK-NEXT:    add r2, sp, #64
159; CHECK-NEXT:    vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
160; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
161; CHECK-NEXT:    mov r0, r1
162; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
163; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
164; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
165; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]!
166; CHECK-NEXT:    vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
167; CHECK-NEXT:    add r2, sp, #128
168; CHECK-NEXT:    vmov q7, q6
169; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r0]
170; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r0]
171; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r0]
172; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]
173; CHECK-NEXT:    vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
174; CHECK-NEXT:    add.w r0, r1, #192
175; CHECK-NEXT:    adds r1, #128
176; CHECK-NEXT:    vmov q3, q2
177; CHECK-NEXT:    vst40.32 {q4, q5, q6, q7}, [r1]
178; CHECK-NEXT:    vst41.32 {q4, q5, q6, q7}, [r1]
179; CHECK-NEXT:    vst42.32 {q4, q5, q6, q7}, [r1]
180; CHECK-NEXT:    vst43.32 {q4, q5, q6, q7}, [r1]
181; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r0]
182; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r0]
183; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r0]
184; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]
185; CHECK-NEXT:    add sp, #216
186; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
187; CHECK-NEXT:    pop {r4, r5}
188; CHECK-NEXT:    bx lr
189entry:
190  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
191  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
192  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
193  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
194  %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2
195  %l3 = load <16 x i32>, <16 x i32>* %s3, align 4
196  %s4 = getelementptr <16 x i32>, <16 x i32>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
197  %l4 = load <16 x i32>, <16 x i32>* %s3, align 4
198  %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
199  %t2 = shufflevector <16 x i32> %l3, <16 x i32> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
200  %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
201  store <64 x i32> %s, <64 x i32> *%dst
202  ret void
203}
204
205; i16
206
; vst4_v2i16: loads <2 x i16> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <8 x i16> to %dst.
; The CHECK lines are the autogenerated expected llc output: scalar ldrh loads,
; vmov.16 lane inserts and a single vstrw.32 store.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly inserts the same register into lanes 2/3 and into lanes 6/7.
; %s4 was presumably intended -- TODO confirm. Changing it requires regenerating
; the CHECK lines.
207define void @vst4_v2i16(<2 x i16> *%src, <8 x i16> *%dst) {
208; CHECK-LABEL: vst4_v2i16:
209; CHECK:       @ %bb.0: @ %entry
210; CHECK-NEXT:    .save {r4, lr}
211; CHECK-NEXT:    push {r4, lr}
212; CHECK-NEXT:    ldrh r4, [r0]
213; CHECK-NEXT:    ldrh.w lr, [r0, #4]
214; CHECK-NEXT:    ldrh r3, [r0, #8]
215; CHECK-NEXT:    vmov.32 q0[0], r4
216; CHECK-NEXT:    ldrh.w r12, [r0, #6]
217; CHECK-NEXT:    ldrh r2, [r0, #10]
218; CHECK-NEXT:    ldrh r0, [r0, #2]
219; CHECK-NEXT:    vmov.32 q0[2], r0
220; CHECK-NEXT:    vmov r4, s0
221; CHECK-NEXT:    vmov.16 q0[0], r4
222; CHECK-NEXT:    vmov.16 q0[1], lr
223; CHECK-NEXT:    vmov.16 q0[2], r3
224; CHECK-NEXT:    vmov.16 q0[3], r3
225; CHECK-NEXT:    vmov.16 q0[4], r0
226; CHECK-NEXT:    vmov.16 q0[5], r12
227; CHECK-NEXT:    vmov.16 q0[6], r2
228; CHECK-NEXT:    vmov.16 q0[7], r2
229; CHECK-NEXT:    vstrw.32 q0, [r1]
230; CHECK-NEXT:    pop {r4, pc}
231entry:
232  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
233  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
234  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
235  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
236  %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2
237  %l3 = load <2 x i16>, <2 x i16>* %s3, align 4
238  %s4 = getelementptr <2 x i16>, <2 x i16>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
239  %l4 = load <2 x i16>, <2 x i16>* %s3, align 4
240  %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
241  %t2 = shufflevector <2 x i16> %l3, <2 x i16> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
242  %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
243  store <8 x i16> %s, <8 x i16> *%dst
244  ret void
245}
246
; vst4_v4i16: loads <4 x i16> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <16 x i16> to %dst.
; The CHECK lines are the autogenerated expected llc output: three vldrh.u32
; loads, vmov.16 lane inserts and two vstrw.32 stores.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly inserts each third-stream value into two adjacent lanes.
; %s4 was presumably intended -- TODO confirm. Changing it requires regenerating
; the CHECK lines.
247define void @vst4_v4i16(<4 x i16> *%src, <16 x i16> *%dst) {
248; CHECK-LABEL: vst4_v4i16:
249; CHECK:       @ %bb.0: @ %entry
250; CHECK-NEXT:    .vsave {d8, d9}
251; CHECK-NEXT:    vpush {d8, d9}
252; CHECK-NEXT:    vldrh.u32 q1, [r0]
253; CHECK-NEXT:    vldrh.u32 q2, [r0, #8]
254; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
255; CHECK-NEXT:    vmov r2, s6
256; CHECK-NEXT:    vmov.16 q0[0], r2
257; CHECK-NEXT:    vmov r2, s10
258; CHECK-NEXT:    vmov.16 q0[1], r2
259; CHECK-NEXT:    vmov r0, s14
260; CHECK-NEXT:    vmov.16 q0[2], r0
261; CHECK-NEXT:    vmov.16 q0[3], r0
262; CHECK-NEXT:    vmov r0, s7
263; CHECK-NEXT:    vmov.16 q0[4], r0
264; CHECK-NEXT:    vmov r0, s11
265; CHECK-NEXT:    vmov.16 q0[5], r0
266; CHECK-NEXT:    vmov r0, s15
267; CHECK-NEXT:    vmov.16 q0[6], r0
268; CHECK-NEXT:    vmov.16 q0[7], r0
269; CHECK-NEXT:    vmov r0, s4
270; CHECK-NEXT:    vmov.16 q4[0], r0
271; CHECK-NEXT:    vmov r0, s8
272; CHECK-NEXT:    vmov.16 q4[1], r0
273; CHECK-NEXT:    vmov r0, s12
274; CHECK-NEXT:    vmov.16 q4[2], r0
275; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
276; CHECK-NEXT:    vmov.16 q4[3], r0
277; CHECK-NEXT:    vmov r0, s5
278; CHECK-NEXT:    vmov.16 q4[4], r0
279; CHECK-NEXT:    vmov r0, s9
280; CHECK-NEXT:    vmov.16 q4[5], r0
281; CHECK-NEXT:    vmov r0, s13
282; CHECK-NEXT:    vmov.16 q4[6], r0
283; CHECK-NEXT:    vmov.16 q4[7], r0
284; CHECK-NEXT:    vstrw.32 q4, [r1]
285; CHECK-NEXT:    vpop {d8, d9}
286; CHECK-NEXT:    bx lr
287entry:
288  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
289  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
290  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
291  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
292  %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2
293  %l3 = load <4 x i16>, <4 x i16>* %s3, align 4
294  %s4 = getelementptr <4 x i16>, <4 x i16>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
295  %l4 = load <4 x i16>, <4 x i16>* %s3, align 4
296  %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
297  %t2 = shufflevector <4 x i16> %l3, <4 x i16> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
298  %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
299  store <16 x i16> %s, <16 x i16> *%dst
300  ret void
301}
302
; vst4_v8i16: loads <8 x i16> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <32 x i16> to %dst.
; The CHECK lines are the autogenerated expected llc output: one vst40-vst43.16
; group on q0-q3.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly loads only three vectors and copies q2 into q3. %s4 was presumably
; intended -- TODO confirm. Changing it requires regenerating the CHECK lines.
303define void @vst4_v8i16(<8 x i16> *%src, <32 x i16> *%dst) {
304; CHECK-LABEL: vst4_v8i16:
305; CHECK:       @ %bb.0: @ %entry
306; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
307; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
308; CHECK-NEXT:    vldrw.u32 q0, [r0]
309; CHECK-NEXT:    vmov q3, q2
310; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
311; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
312; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
313; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
314; CHECK-NEXT:    bx lr
315entry:
316  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
317  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
318  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
319  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
320  %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
321  %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
322  %s4 = getelementptr <8 x i16>, <8 x i16>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
323  %l4 = load <8 x i16>, <8 x i16>* %s3, align 4
324  %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
325  %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
326  %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
327  store <32 x i16> %s, <32 x i16> *%dst
328  ret void
329}
330
; vst4_v16i16: loads <16 x i16> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <64 x i16> to %dst.
; The CHECK lines are the autogenerated expected llc output: two vst40-vst43.16
; groups, the first ending with a writeback on r1.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly copies q6 into q7 and q2 into q3 instead of loading a fourth stream.
; %s4 was presumably intended -- TODO confirm. Changing it requires regenerating
; the CHECK lines.
331define void @vst4_v16i16(<16 x i16> *%src, <64 x i16> *%dst) {
332; CHECK-LABEL: vst4_v16i16:
333; CHECK:       @ %bb.0: @ %entry
334; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
335; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
336; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
337; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
338; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
339; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
340; CHECK-NEXT:    vldrw.u32 q4, [r0]
341; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
342; CHECK-NEXT:    vmov q7, q6
343; CHECK-NEXT:    vmov q3, q2
344; CHECK-NEXT:    vst40.16 {q4, q5, q6, q7}, [r1]
345; CHECK-NEXT:    vst41.16 {q4, q5, q6, q7}, [r1]
346; CHECK-NEXT:    vst42.16 {q4, q5, q6, q7}, [r1]
347; CHECK-NEXT:    vst43.16 {q4, q5, q6, q7}, [r1]!
348; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
349; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
350; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
351; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
352; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
353; CHECK-NEXT:    bx lr
354entry:
355  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
356  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
357  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
358  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
359  %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2
360  %l3 = load <16 x i16>, <16 x i16>* %s3, align 4
361  %s4 = getelementptr <16 x i16>, <16 x i16>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
362  %l4 = load <16 x i16>, <16 x i16>* %s3, align 4
363  %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
364  %t2 = shufflevector <16 x i16> %l3, <16 x i16> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
365  %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
366  store <64 x i16> %s, <64 x i16> *%dst
367  ret void
368}
369
370; i8
371
; vst4_v2i8: loads <2 x i8> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <8 x i8> to %dst.
; The CHECK lines are the autogenerated expected llc output: scalar ldrb loads,
; vmov.16 lane inserts and a single narrowing vstrb.16 store.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly inserts the same register into lanes 2/3 and into lanes 6/7.
; %s4 was presumably intended -- TODO confirm. Changing it requires regenerating
; the CHECK lines.
372define void @vst4_v2i8(<2 x i8> *%src, <8 x i8> *%dst) {
373; CHECK-LABEL: vst4_v2i8:
374; CHECK:       @ %bb.0: @ %entry
375; CHECK-NEXT:    .save {r4, lr}
376; CHECK-NEXT:    push {r4, lr}
377; CHECK-NEXT:    ldrb r2, [r0]
378; CHECK-NEXT:    ldrb r3, [r0, #1]
379; CHECK-NEXT:    vmov.32 q0[0], r2
380; CHECK-NEXT:    ldrb.w r12, [r0, #2]
381; CHECK-NEXT:    vmov.32 q0[2], r3
382; CHECK-NEXT:    ldrb.w lr, [r0, #3]
383; CHECK-NEXT:    vmov r2, s0
384; CHECK-NEXT:    ldrb r4, [r0, #5]
385; CHECK-NEXT:    vmov.16 q0[0], r2
386; CHECK-NEXT:    ldrb r0, [r0, #4]
387; CHECK-NEXT:    vmov.16 q0[1], r12
388; CHECK-NEXT:    vmov.16 q0[2], r0
389; CHECK-NEXT:    vmov.16 q0[3], r0
390; CHECK-NEXT:    vmov.16 q0[4], r3
391; CHECK-NEXT:    vmov.16 q0[5], lr
392; CHECK-NEXT:    vmov.16 q0[6], r4
393; CHECK-NEXT:    vmov.16 q0[7], r4
394; CHECK-NEXT:    vstrb.16 q0, [r1]
395; CHECK-NEXT:    pop {r4, pc}
396entry:
397  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
398  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
399  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
400  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
401  %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2
402  %l3 = load <2 x i8>, <2 x i8>* %s3, align 4
403  %s4 = getelementptr <2 x i8>, <2 x i8>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
404  %l4 = load <2 x i8>, <2 x i8>* %s3, align 4
405  %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
406  %t2 = shufflevector <2 x i8> %l3, <2 x i8> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
407  %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
408  store <8 x i8> %s, <8 x i8> *%dst
409  ret void
410}
411
; vst4_v4i8: loads <4 x i8> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <16 x i8> to %dst.
; The CHECK lines are the autogenerated expected llc output: three vldrb.u32
; loads, vmov.8 lane inserts and a single vstrw.32 store.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly inserts each third-stream value into two adjacent lanes.
; %s4 was presumably intended -- TODO confirm. Changing it requires regenerating
; the CHECK lines.
412define void @vst4_v4i8(<4 x i8> *%src, <16 x i8> *%dst) {
413; CHECK-LABEL: vst4_v4i8:
414; CHECK:       @ %bb.0: @ %entry
415; CHECK-NEXT:    vldrb.u32 q1, [r0]
416; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
417; CHECK-NEXT:    vldrb.u32 q3, [r0, #8]
418; CHECK-NEXT:    vmov r2, s4
419; CHECK-NEXT:    vmov.8 q0[0], r2
420; CHECK-NEXT:    vmov r2, s8
421; CHECK-NEXT:    vmov.8 q0[1], r2
422; CHECK-NEXT:    vmov r0, s12
423; CHECK-NEXT:    vmov.8 q0[2], r0
424; CHECK-NEXT:    vmov.8 q0[3], r0
425; CHECK-NEXT:    vmov r0, s5
426; CHECK-NEXT:    vmov.8 q0[4], r0
427; CHECK-NEXT:    vmov r0, s9
428; CHECK-NEXT:    vmov.8 q0[5], r0
429; CHECK-NEXT:    vmov r0, s13
430; CHECK-NEXT:    vmov.8 q0[6], r0
431; CHECK-NEXT:    vmov.8 q0[7], r0
432; CHECK-NEXT:    vmov r0, s6
433; CHECK-NEXT:    vmov.8 q0[8], r0
434; CHECK-NEXT:    vmov r0, s10
435; CHECK-NEXT:    vmov.8 q0[9], r0
436; CHECK-NEXT:    vmov r0, s14
437; CHECK-NEXT:    vmov.8 q0[10], r0
438; CHECK-NEXT:    vmov.8 q0[11], r0
439; CHECK-NEXT:    vmov r0, s7
440; CHECK-NEXT:    vmov.8 q0[12], r0
441; CHECK-NEXT:    vmov r0, s11
442; CHECK-NEXT:    vmov.8 q0[13], r0
443; CHECK-NEXT:    vmov r0, s15
444; CHECK-NEXT:    vmov.8 q0[14], r0
445; CHECK-NEXT:    vmov.8 q0[15], r0
446; CHECK-NEXT:    vstrw.32 q0, [r1]
447; CHECK-NEXT:    bx lr
448entry:
449  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
450  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
451  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
452  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
453  %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2
454  %l3 = load <4 x i8>, <4 x i8>* %s3, align 4
455  %s4 = getelementptr <4 x i8>, <4 x i8>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
456  %l4 = load <4 x i8>, <4 x i8>* %s3, align 4
457  %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
458  %t2 = shufflevector <4 x i8> %l3, <4 x i8> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
459  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
460  store <16 x i8> %s, <16 x i8> *%dst
461  ret void
462}
463
; vst4_v8i8: loads <8 x i8> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <32 x i8> to %dst.
; The CHECK lines are the autogenerated expected llc output: three vldrb.u16
; loads, vmov.u16 extracts / vmov.8 inserts and two vstrw.32 stores.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly inserts each third-stream value into two adjacent lanes.
; %s4 was presumably intended -- TODO confirm. Changing it requires regenerating
; the CHECK lines.
464define void @vst4_v8i8(<8 x i8> *%src, <32 x i8> *%dst) {
465; CHECK-LABEL: vst4_v8i8:
466; CHECK:       @ %bb.0: @ %entry
467; CHECK-NEXT:    .vsave {d8, d9}
468; CHECK-NEXT:    vpush {d8, d9}
469; CHECK-NEXT:    vldrb.u16 q1, [r0]
470; CHECK-NEXT:    vldrb.u16 q2, [r0, #8]
471; CHECK-NEXT:    vldrb.u16 q3, [r0, #16]
472; CHECK-NEXT:    vmov.u16 r2, q1[4]
473; CHECK-NEXT:    vmov.8 q0[0], r2
474; CHECK-NEXT:    vmov.u16 r2, q2[4]
475; CHECK-NEXT:    vmov.8 q0[1], r2
476; CHECK-NEXT:    vmov.u16 r0, q3[4]
477; CHECK-NEXT:    vmov.8 q0[2], r0
478; CHECK-NEXT:    vmov.8 q0[3], r0
479; CHECK-NEXT:    vmov.u16 r0, q1[5]
480; CHECK-NEXT:    vmov.8 q0[4], r0
481; CHECK-NEXT:    vmov.u16 r0, q2[5]
482; CHECK-NEXT:    vmov.8 q0[5], r0
483; CHECK-NEXT:    vmov.u16 r0, q3[5]
484; CHECK-NEXT:    vmov.8 q0[6], r0
485; CHECK-NEXT:    vmov.8 q0[7], r0
486; CHECK-NEXT:    vmov.u16 r0, q1[6]
487; CHECK-NEXT:    vmov.8 q0[8], r0
488; CHECK-NEXT:    vmov.u16 r0, q2[6]
489; CHECK-NEXT:    vmov.8 q0[9], r0
490; CHECK-NEXT:    vmov.u16 r0, q3[6]
491; CHECK-NEXT:    vmov.8 q0[10], r0
492; CHECK-NEXT:    vmov.8 q0[11], r0
493; CHECK-NEXT:    vmov.u16 r0, q1[7]
494; CHECK-NEXT:    vmov.8 q0[12], r0
495; CHECK-NEXT:    vmov.u16 r0, q2[7]
496; CHECK-NEXT:    vmov.8 q0[13], r0
497; CHECK-NEXT:    vmov.u16 r0, q3[7]
498; CHECK-NEXT:    vmov.8 q0[14], r0
499; CHECK-NEXT:    vmov.8 q0[15], r0
500; CHECK-NEXT:    vmov.u16 r0, q1[0]
501; CHECK-NEXT:    vmov.8 q4[0], r0
502; CHECK-NEXT:    vmov.u16 r0, q2[0]
503; CHECK-NEXT:    vmov.8 q4[1], r0
504; CHECK-NEXT:    vmov.u16 r0, q3[0]
505; CHECK-NEXT:    vmov.8 q4[2], r0
506; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
507; CHECK-NEXT:    vmov.8 q4[3], r0
508; CHECK-NEXT:    vmov.u16 r0, q1[1]
509; CHECK-NEXT:    vmov.8 q4[4], r0
510; CHECK-NEXT:    vmov.u16 r0, q2[1]
511; CHECK-NEXT:    vmov.8 q4[5], r0
512; CHECK-NEXT:    vmov.u16 r0, q3[1]
513; CHECK-NEXT:    vmov.8 q4[6], r0
514; CHECK-NEXT:    vmov.8 q4[7], r0
515; CHECK-NEXT:    vmov.u16 r0, q1[2]
516; CHECK-NEXT:    vmov.8 q4[8], r0
517; CHECK-NEXT:    vmov.u16 r0, q2[2]
518; CHECK-NEXT:    vmov.8 q4[9], r0
519; CHECK-NEXT:    vmov.u16 r0, q3[2]
520; CHECK-NEXT:    vmov.8 q4[10], r0
521; CHECK-NEXT:    vmov.8 q4[11], r0
522; CHECK-NEXT:    vmov.u16 r0, q1[3]
523; CHECK-NEXT:    vmov.8 q4[12], r0
524; CHECK-NEXT:    vmov.u16 r0, q2[3]
525; CHECK-NEXT:    vmov.8 q4[13], r0
526; CHECK-NEXT:    vmov.u16 r0, q3[3]
527; CHECK-NEXT:    vmov.8 q4[14], r0
528; CHECK-NEXT:    vmov.8 q4[15], r0
529; CHECK-NEXT:    vstrw.32 q4, [r1]
530; CHECK-NEXT:    vpop {d8, d9}
531; CHECK-NEXT:    bx lr
532entry:
533  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
534  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
535  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
536  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
537  %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2
538  %l3 = load <8 x i8>, <8 x i8>* %s3, align 4
539  %s4 = getelementptr <8 x i8>, <8 x i8>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
540  %l4 = load <8 x i8>, <8 x i8>* %s3, align 4
541  %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
542  %t2 = shufflevector <8 x i8> %l3, <8 x i8> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
543  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
544  store <32 x i8> %s, <32 x i8> *%dst
545  ret void
546}
547
; vst4_v16i8: loads <16 x i8> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <64 x i8> to %dst.
; The CHECK lines are the autogenerated expected llc output: one vst40-vst43.8
; group on q0-q3.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused; the expected code
; accordingly loads only three vectors and copies q2 into q3. %s4 was presumably
; intended -- TODO confirm. Changing it requires regenerating the CHECK lines.
548define void @vst4_v16i8(<16 x i8> *%src, <64 x i8> *%dst) {
549; CHECK-LABEL: vst4_v16i8:
550; CHECK:       @ %bb.0: @ %entry
551; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
552; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
553; CHECK-NEXT:    vldrw.u32 q0, [r0]
554; CHECK-NEXT:    vmov q3, q2
555; CHECK-NEXT:    vst40.8 {q0, q1, q2, q3}, [r1]
556; CHECK-NEXT:    vst41.8 {q0, q1, q2, q3}, [r1]
557; CHECK-NEXT:    vst42.8 {q0, q1, q2, q3}, [r1]
558; CHECK-NEXT:    vst43.8 {q0, q1, q2, q3}, [r1]
559; CHECK-NEXT:    bx lr
560entry:
561  %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
562  %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
563  %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
564  %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
565  %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2
566  %l3 = load <16 x i8>, <16 x i8>* %s3, align 4
567  %s4 = getelementptr <16 x i8>, <16 x i8>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
568  %l4 = load <16 x i8>, <16 x i8>* %s3, align 4
569  %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
570  %t2 = shufflevector <16 x i8> %l3, <16 x i8> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
571  %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
572  store <64 x i8> %s, <64 x i8> *%dst
573  ret void
574}
575
576; i64
577
; vst4_v2i64: loads <2 x i64> vectors from %src, concatenates them pairwise and
; interleaves them 4-ways with shufflevector, then stores the <8 x i64> to %dst.
; The CHECK lines are the autogenerated expected llc output: three vldrw.u32
; loads, vmov.f64/vmov.f32 register moves and four plain vstrw.32 stores.
; NOTE(review): %l4 is loaded through %s3, so %s4 is unused and the 3rd and 4th
; interleaved streams are identical. %s4 was presumably intended -- TODO confirm.
; Changing it requires regenerating the CHECK lines with update_llc_test_checks.py.
578define void @vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
579; CHECK-LABEL: vst4_v2i64:
580; CHECK:       @ %bb.0: @ %entry
581; CHECK-NEXT:    vldrw.u32 q3, [r0]
582; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
583; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
584; CHECK-NEXT:    vmov.f64 d4, d6
585; CHECK-NEXT:    vmov.f32 s9, s13
586; CHECK-NEXT:    vmov.f32 s10, s0
587; CHECK-NEXT:    vmov.f32 s11, s1
588; CHECK-NEXT:    vmov.f32 s0, s14
589; CHECK-NEXT:    vstrw.32 q2, [r1]
590; CHECK-NEXT:    vmov.f32 s1, s15
591; CHECK-NEXT:    vmov.f64 d6, d2
592; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
593; CHECK-NEXT:    vmov.f32 s13, s5
594; CHECK-NEXT:    vmov.f32 s14, s4
595; CHECK-NEXT:    vmov.f32 s15, s5
596; CHECK-NEXT:    vmov.f32 s4, s6
597; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
598; CHECK-NEXT:    vmov.f32 s5, s7
599; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
600; CHECK-NEXT:    bx lr
601entry:
602  %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
603  %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
604  %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
605  %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
606  %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2
607  %l3 = load <2 x i64>, <2 x i64>* %s3, align 4
608  %s4 = getelementptr <2 x i64>, <2 x i64>* %src, i32 3
  ; NOTE(review): loads through %s3 again, not %s4 (see header comment).
609  %l4 = load <2 x i64>, <2 x i64>* %s3, align 4
610  %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
611  %t2 = shufflevector <2 x i64> %l3, <2 x i64> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Interleave: lane i of each of the four loaded vectors becomes output lanes 4*i..4*i+3.
612  %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
613  store <8 x i64> %s, <8 x i64> *%dst
614  ret void
615}
616
; vst4_v4i64: 4-way interleaving store of <4 x i64> inputs read from %src.
; %t1/%t2 concatenate the inputs and the final mask (0,4,8,12,1,5,9,13,...)
; takes lane i of each input in turn; the <16 x i64> result is stored to %dst.
; The expected code saves d8-d15, issues six vldrw.u32 loads, shuffles with
; vmov copies and writes the result with eight vstrw.32 stores.
617define void @vst4_v4i64(<4 x i64> *%src, <16 x i64> *%dst) {
618; CHECK-LABEL: vst4_v4i64:
619; CHECK:       @ %bb.0: @ %entry
620; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
621; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
622; CHECK-NEXT:    vldrw.u32 q5, [r0]
623; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
624; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
625; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
626; CHECK-NEXT:    vmov.f64 d6, d10
627; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
628; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
629; CHECK-NEXT:    vmov.f32 s13, s21
630; CHECK-NEXT:    vmov.f32 s14, s0
631; CHECK-NEXT:    vmov.f32 s15, s1
632; CHECK-NEXT:    vmov.f32 s0, s22
633; CHECK-NEXT:    vstrw.32 q3, [r1]
634; CHECK-NEXT:    vmov.f32 s1, s23
635; CHECK-NEXT:    vmov.f64 d10, d12
636; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
637; CHECK-NEXT:    vmov.f32 s21, s25
638; CHECK-NEXT:    vmov.f32 s22, s8
639; CHECK-NEXT:    vmov.f32 s23, s9
640; CHECK-NEXT:    vmov.f32 s8, s26
641; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
642; CHECK-NEXT:    vmov.f32 s9, s27
643; CHECK-NEXT:    vmov.f64 d12, d2
644; CHECK-NEXT:    vstrw.32 q2, [r1, #96]
645; CHECK-NEXT:    vmov.f64 d14, d8
646; CHECK-NEXT:    vmov.f32 s25, s5
647; CHECK-NEXT:    vmov.f32 s29, s17
648; CHECK-NEXT:    vmov.f32 s26, s4
649; CHECK-NEXT:    vmov.f32 s30, s16
650; CHECK-NEXT:    vmov.f32 s27, s5
651; CHECK-NEXT:    vmov.f32 s4, s6
652; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
653; CHECK-NEXT:    vmov.f32 s31, s17
654; CHECK-NEXT:    vmov.f32 s16, s18
655; CHECK-NEXT:    vstrw.32 q7, [r1, #80]
656; CHECK-NEXT:    vmov.f32 s5, s7
657; CHECK-NEXT:    vmov.f32 s17, s19
658; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
659; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
660; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
661; CHECK-NEXT:    bx lr
662entry:
663  %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
664  %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
665  %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
666  %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
667  %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2
668  %l3 = load <4 x i64>, <4 x i64>* %s3, align 4
669  %s4 = getelementptr <4 x i64>, <4 x i64>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data. Presumably %s4 was intended -- confirm; if changed, regenerate the
; assertions with utils/update_llc_test_checks.py.
670  %l4 = load <4 x i64>, <4 x i64>* %s3, align 4
671  %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
672  %t2 = shufflevector <4 x i64> %l3, <4 x i64> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
673  %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
674  store <16 x i64> %s, <16 x i64> *%dst
675  ret void
676}
677
678; f32
679
; vst4_v2f32: 4-way interleaving store of <2 x float> inputs read from %src.
; The final <0,2,4,6,1,3,5,7> mask emits element 0 of every input followed by
; element 1 of every input; the <8 x float> result is stored to %dst. The
; expected code loads six scalars with vldr (offsets 0-20), copies s2->s3 and
; s6->s7 to fill the fourth input's lanes, and stores two q registers.
680define void @vst4_v2f32(<2 x float> *%src, <8 x float> *%dst) {
681; CHECK-LABEL: vst4_v2f32:
682; CHECK:       @ %bb.0: @ %entry
683; CHECK-NEXT:    vldr s0, [r0]
684; CHECK-NEXT:    vldr s4, [r0, #4]
685; CHECK-NEXT:    vldr s1, [r0, #8]
686; CHECK-NEXT:    vldr s5, [r0, #12]
687; CHECK-NEXT:    vldr s2, [r0, #16]
688; CHECK-NEXT:    vldr s6, [r0, #20]
689; CHECK-NEXT:    vmov.f32 s3, s2
690; CHECK-NEXT:    vmov.f32 s7, s6
691; CHECK-NEXT:    vstrw.32 q0, [r1]
692; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
693; CHECK-NEXT:    bx lr
694entry:
695  %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
696  %l1 = load <2 x float>, <2 x float>* %s1, align 4
697  %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
698  %l2 = load <2 x float>, <2 x float>* %s2, align 4
699  %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2
700  %l3 = load <2 x float>, <2 x float>* %s3, align 4
701  %s4 = getelementptr <2 x float>, <2 x float>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data. Presumably %s4 was intended -- confirm; if changed, regenerate the
; assertions with utils/update_llc_test_checks.py.
702  %l4 = load <2 x float>, <2 x float>* %s3, align 4
703  %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
704  %t2 = shufflevector <2 x float> %l3, <2 x float> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
705  %s = shufflevector <4 x float> %t1, <4 x float> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
706  store <8 x float> %s, <8 x float> *%dst
707  ret void
708}
709
; vst4_v4f32: 4-way interleaving store of <4 x float> inputs read from %src.
; The final mask (0,4,8,12,1,5,9,13,...) takes lane i of each input in turn and
; the <16 x float> result is stored to %dst. The expected code is a single
; vst40-vst43.32 group fed by three vldrw.u32 loads, with q3 produced by
; copying q2.
710define void @vst4_v4f32(<4 x float> *%src, <16 x float> *%dst) {
711; CHECK-LABEL: vst4_v4f32:
712; CHECK:       @ %bb.0: @ %entry
713; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
714; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
715; CHECK-NEXT:    vldrw.u32 q0, [r0]
716; CHECK-NEXT:    vmov q3, q2
717; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
718; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
719; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
720; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
721; CHECK-NEXT:    bx lr
722entry:
723  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
724  %l1 = load <4 x float>, <4 x float>* %s1, align 4
725  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
726  %l2 = load <4 x float>, <4 x float>* %s2, align 4
727  %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
728  %l3 = load <4 x float>, <4 x float>* %s3, align 4
729  %s4 = getelementptr <4 x float>, <4 x float>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data (hence the q2 -> q3 copy above). Presumably %s4 was intended --
; confirm; if changed, regenerate the assertions with
; utils/update_llc_test_checks.py.
730  %l4 = load <4 x float>, <4 x float>* %s3, align 4
731  %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
732  %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
733  %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
734  store <16 x float> %s, <16 x float> *%dst
735  ret void
736}
737
; vst4_v8f32: 4-way interleaving store of <8 x float> inputs read from %src.
; The final mask (0,8,16,24,1,9,17,25,...) takes lane i of each input in turn
; and the <32 x float> result is stored to %dst. The expected code saves
; d8-d15, issues six vldrw.u32 loads, copies q6->q7 and q2->q3, and emits two
; vst40-vst43.32 groups; the first group ends with a writeback of r1.
738define void @vst4_v8f32(<8 x float> *%src, <32 x float> *%dst) {
739; CHECK-LABEL: vst4_v8f32:
740; CHECK:       @ %bb.0: @ %entry
741; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
742; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
743; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
744; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
745; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
746; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
747; CHECK-NEXT:    vldrw.u32 q4, [r0]
748; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
749; CHECK-NEXT:    vmov q7, q6
750; CHECK-NEXT:    vmov q3, q2
751; CHECK-NEXT:    vst40.32 {q4, q5, q6, q7}, [r1]
752; CHECK-NEXT:    vst41.32 {q4, q5, q6, q7}, [r1]
753; CHECK-NEXT:    vst42.32 {q4, q5, q6, q7}, [r1]
754; CHECK-NEXT:    vst43.32 {q4, q5, q6, q7}, [r1]!
755; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
756; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
757; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
758; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
759; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
760; CHECK-NEXT:    bx lr
761entry:
762  %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
763  %l1 = load <8 x float>, <8 x float>* %s1, align 4
764  %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
765  %l2 = load <8 x float>, <8 x float>* %s2, align 4
766  %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2
767  %l3 = load <8 x float>, <8 x float>* %s3, align 4
768  %s4 = getelementptr <8 x float>, <8 x float>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data (hence the register copies above). Presumably %s4 was intended --
; confirm; if changed, regenerate the assertions with
; utils/update_llc_test_checks.py.
769  %l4 = load <8 x float>, <8 x float>* %s3, align 4
770  %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
771  %t2 = shufflevector <8 x float> %l3, <8 x float> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
772  %s = shufflevector <16 x float> %t1, <16 x float> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
773  store <32 x float> %s, <32 x float> *%dst
774  ret void
775}
776
; vst4_v16f32: 4-way interleaving store of <16 x float> inputs read from %src.
; The final mask (0,16,32,48,1,17,33,49,...) takes lane i of each input in turn
; and the <64 x float> result is stored to %dst. The expected code saves r4/r5
; and d8-d15, reserves 216 bytes of stack for 64-byte spills/reloads of
; four-register groups, loads only from offsets below 192, and emits four
; vst40-vst43.32 groups.
777define void @vst4_v16f32(<16 x float> *%src, <64 x float> *%dst) {
778; CHECK-LABEL: vst4_v16f32:
779; CHECK:       @ %bb.0: @ %entry
780; CHECK-NEXT:    .save {r4, r5}
781; CHECK-NEXT:    push {r4, r5}
782; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
783; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
784; CHECK-NEXT:    .pad #216
785; CHECK-NEXT:    sub sp, #216
786; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
787; CHECK-NEXT:    add r2, sp, #64
788; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
789; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
790; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
791; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
792; CHECK-NEXT:    add r2, sp, #128
793; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
794; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
795; CHECK-NEXT:    add r2, sp, #64
796; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
797; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
798; CHECK-NEXT:    add r2, sp, #64
799; CHECK-NEXT:    vmov q7, q6
800; CHECK-NEXT:    vldrw.u32 q0, [r0]
801; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
802; CHECK-NEXT:    add r2, sp, #128
803; CHECK-NEXT:    vldrw.u32 q6, [r0, #160]
804; CHECK-NEXT:    vmov q3, q2
805; CHECK-NEXT:    vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
806; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
807; CHECK-NEXT:    add r2, sp, #128
808; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
809; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
810; CHECK-NEXT:    add r2, sp, #128
811; CHECK-NEXT:    vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
812; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
813; CHECK-NEXT:    vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
814; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
815; CHECK-NEXT:    add r2, sp, #128
816; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
817; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
818; CHECK-NEXT:    add r2, sp, #64
819; CHECK-NEXT:    vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
820; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
821; CHECK-NEXT:    mov r0, r1
822; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
823; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
824; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
825; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]!
826; CHECK-NEXT:    vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
827; CHECK-NEXT:    add r2, sp, #128
828; CHECK-NEXT:    vmov q7, q6
829; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r0]
830; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r0]
831; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r0]
832; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]
833; CHECK-NEXT:    vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
834; CHECK-NEXT:    add.w r0, r1, #192
835; CHECK-NEXT:    adds r1, #128
836; CHECK-NEXT:    vmov q3, q2
837; CHECK-NEXT:    vst40.32 {q4, q5, q6, q7}, [r1]
838; CHECK-NEXT:    vst41.32 {q4, q5, q6, q7}, [r1]
839; CHECK-NEXT:    vst42.32 {q4, q5, q6, q7}, [r1]
840; CHECK-NEXT:    vst43.32 {q4, q5, q6, q7}, [r1]
841; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r0]
842; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r0]
843; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r0]
844; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]
845; CHECK-NEXT:    add sp, #216
846; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
847; CHECK-NEXT:    pop {r4, r5}
848; CHECK-NEXT:    bx lr
849entry:
850  %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
851  %l1 = load <16 x float>, <16 x float>* %s1, align 4
852  %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
853  %l2 = load <16 x float>, <16 x float>* %s2, align 4
854  %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2
855  %l3 = load <16 x float>, <16 x float>* %s3, align 4
856  %s4 = getelementptr <16 x float>, <16 x float>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data (hence the register copies above). Presumably %s4 was intended --
; confirm; if changed, regenerate the assertions with
; utils/update_llc_test_checks.py.
857  %l4 = load <16 x float>, <16 x float>* %s3, align 4
858  %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
859  %t2 = shufflevector <16 x float> %l3, <16 x float> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
860  %s = shufflevector <32 x float> %t1, <32 x float> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
861  store <64 x float> %s, <64 x float> *%dst
862  ret void
863}
864
865; f16
866
; vst4_v2f16: 4-way interleaving store of <2 x half> inputs read from %src.
; The final <0,2,4,6,1,3,5,7> mask emits element 0 of every input followed by
; element 1 of every input; the <8 x half> result is stored to %dst. The
; expected code loads two words with vldmia and one with ldr, broadcasts the
; latter with vdup.32, assembles q0 one 16-bit lane at a time (vmovx.f16 to
; reach the upper halves) and finishes with a single vstrw.32.
867define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) {
868; CHECK-LABEL: vst4_v2f16:
869; CHECK:       @ %bb.0: @ %entry
870; CHECK-NEXT:    vldmia r0, {s4, s5}
871; CHECK-NEXT:    vmov r2, s5
872; CHECK-NEXT:    ldr r0, [r0, #8]
873; CHECK-NEXT:    vmov r3, s4
874; CHECK-NEXT:    vmovx.f16 s12, s4
875; CHECK-NEXT:    vmov.16 q0[0], r3
876; CHECK-NEXT:    vdup.32 q2, r0
877; CHECK-NEXT:    vmov.16 q0[1], r2
878; CHECK-NEXT:    vmov r0, s8
879; CHECK-NEXT:    vmov.16 q0[2], r0
880; CHECK-NEXT:    vmov r0, s9
881; CHECK-NEXT:    vmov.16 q0[3], r0
882; CHECK-NEXT:    vmov r0, s12
883; CHECK-NEXT:    vmovx.f16 s4, s5
884; CHECK-NEXT:    vmov.16 q0[4], r0
885; CHECK-NEXT:    vmov r0, s4
886; CHECK-NEXT:    vmovx.f16 s4, s8
887; CHECK-NEXT:    vmov.16 q0[5], r0
888; CHECK-NEXT:    vmov r0, s4
889; CHECK-NEXT:    vmovx.f16 s4, s9
890; CHECK-NEXT:    vmov.16 q0[6], r0
891; CHECK-NEXT:    vmov r0, s4
892; CHECK-NEXT:    vmov.16 q0[7], r0
893; CHECK-NEXT:    vstrw.32 q0, [r1]
894; CHECK-NEXT:    bx lr
895entry:
896  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
897  %l1 = load <2 x half>, <2 x half>* %s1, align 4
898  %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
899  %l2 = load <2 x half>, <2 x half>* %s2, align 4
900  %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2
901  %l3 = load <2 x half>, <2 x half>* %s3, align 4
902  %s4 = getelementptr <2 x half>, <2 x half>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data (hence the vdup.32 above). Presumably %s4 was intended -- confirm;
; if changed, regenerate the assertions with utils/update_llc_test_checks.py.
903  %l4 = load <2 x half>, <2 x half>* %s3, align 4
904  %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
905  %t2 = shufflevector <2 x half> %l3, <2 x half> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
906  %s = shufflevector <4 x half> %t1, <4 x half> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
907  store <8 x half> %s, <8 x half> *%dst
908  ret void
909}
910
; vst4_v4f16: 4-way interleaving store of <4 x half> inputs read from %src.
; The final mask (0,4,8,12,1,5,9,13,...) takes lane i of each input in turn and
; the <16 x half> result is stored to %dst. The expected code loads the data
; through core registers (ldm + ldrd), builds q1 from the ldrd pair written
; twice (r2, r0, r2, r0), assembles each output q register one 16-bit lane at a
; time and writes it with two vstrw.32 stores.
911define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
912; CHECK-LABEL: vst4_v4f16:
913; CHECK:       @ %bb.0: @ %entry
914; CHECK-NEXT:    .save {r7, lr}
915; CHECK-NEXT:    push {r7, lr}
916; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
917; CHECK-NEXT:    vmov.32 q0[0], r2
918; CHECK-NEXT:    vmov.32 q0[1], r3
919; CHECK-NEXT:    vmov.32 q0[2], r12
920; CHECK-NEXT:    vmov.32 q0[3], lr
921; CHECK-NEXT:    vmov r3, s1
922; CHECK-NEXT:    vmovx.f16 s12, s1
923; CHECK-NEXT:    vmov r2, s3
924; CHECK-NEXT:    vmov.16 q2[0], r3
925; CHECK-NEXT:    vmov.16 q2[1], r2
926; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
927; CHECK-NEXT:    vmov.32 q1[0], r2
928; CHECK-NEXT:    vmov.32 q1[1], r0
929; CHECK-NEXT:    vmov.32 q1[2], r2
930; CHECK-NEXT:    vmov r2, s0
931; CHECK-NEXT:    vmov.32 q1[3], r0
932; CHECK-NEXT:    vmov r0, s5
933; CHECK-NEXT:    vmov.16 q2[2], r0
934; CHECK-NEXT:    vmov r0, s7
935; CHECK-NEXT:    vmov.16 q2[3], r0
936; CHECK-NEXT:    vmov r0, s12
937; CHECK-NEXT:    vmovx.f16 s12, s3
938; CHECK-NEXT:    vmov.16 q2[4], r0
939; CHECK-NEXT:    vmov r0, s12
940; CHECK-NEXT:    vmovx.f16 s12, s5
941; CHECK-NEXT:    vmov.16 q2[5], r0
942; CHECK-NEXT:    vmov r0, s12
943; CHECK-NEXT:    vmovx.f16 s12, s7
944; CHECK-NEXT:    vmov.16 q2[6], r0
945; CHECK-NEXT:    vmov r0, s12
946; CHECK-NEXT:    vmovx.f16 s12, s0
947; CHECK-NEXT:    vmov.16 q2[7], r0
948; CHECK-NEXT:    vmov r0, s2
949; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
950; CHECK-NEXT:    vmov.16 q2[0], r2
951; CHECK-NEXT:    vmov.16 q2[1], r0
952; CHECK-NEXT:    vmov r0, s4
953; CHECK-NEXT:    vmov.16 q2[2], r0
954; CHECK-NEXT:    vmov r0, s6
955; CHECK-NEXT:    vmov.16 q2[3], r0
956; CHECK-NEXT:    vmov r0, s12
957; CHECK-NEXT:    vmovx.f16 s0, s2
958; CHECK-NEXT:    vmov.16 q2[4], r0
959; CHECK-NEXT:    vmov r0, s0
960; CHECK-NEXT:    vmovx.f16 s0, s4
961; CHECK-NEXT:    vmov.16 q2[5], r0
962; CHECK-NEXT:    vmov r0, s0
963; CHECK-NEXT:    vmovx.f16 s0, s6
964; CHECK-NEXT:    vmov.16 q2[6], r0
965; CHECK-NEXT:    vmov r0, s0
966; CHECK-NEXT:    vmov.16 q2[7], r0
967; CHECK-NEXT:    vstrw.32 q2, [r1]
968; CHECK-NEXT:    pop {r7, pc}
969entry:
970  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
971  %l1 = load <4 x half>, <4 x half>* %s1, align 4
972  %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
973  %l2 = load <4 x half>, <4 x half>* %s2, align 4
974  %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2
975  %l3 = load <4 x half>, <4 x half>* %s3, align 4
976  %s4 = getelementptr <4 x half>, <4 x half>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data (hence q1 being filled with the same pair twice). Presumably %s4
; was intended -- confirm; if changed, regenerate the assertions with
; utils/update_llc_test_checks.py.
977  %l4 = load <4 x half>, <4 x half>* %s3, align 4
978  %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
979  %t2 = shufflevector <4 x half> %l3, <4 x half> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
980  %s = shufflevector <8 x half> %t1, <8 x half> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
981  store <16 x half> %s, <16 x half> *%dst
982  ret void
983}
984
; vst4_v8f16: 4-way interleaving store of <8 x half> inputs read from %src.
; The final mask (0,8,16,24,1,9,17,25,...) takes lane i of each input in turn
; and the <32 x half> result is stored to %dst. The expected code is a single
; vst40-vst43.16 group fed by three vldrw.u32 loads, with q3 produced by
; copying q2.
985define void @vst4_v8f16(<8 x half> *%src, <32 x half> *%dst) {
986; CHECK-LABEL: vst4_v8f16:
987; CHECK:       @ %bb.0: @ %entry
988; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
989; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
990; CHECK-NEXT:    vldrw.u32 q0, [r0]
991; CHECK-NEXT:    vmov q3, q2
992; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
993; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
994; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
995; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
996; CHECK-NEXT:    bx lr
997entry:
998  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
999  %l1 = load <8 x half>, <8 x half>* %s1, align 4
1000  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
1001  %l2 = load <8 x half>, <8 x half>* %s2, align 4
1002  %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
1003  %l3 = load <8 x half>, <8 x half>* %s3, align 4
1004  %s4 = getelementptr <8 x half>, <8 x half>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data (hence the q2 -> q3 copy above). Presumably %s4 was intended --
; confirm; if changed, regenerate the assertions with
; utils/update_llc_test_checks.py.
1005  %l4 = load <8 x half>, <8 x half>* %s3, align 4
1006  %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1007  %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1008  %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1009  store <32 x half> %s, <32 x half> *%dst
1010  ret void
1011}
1012
; vst4_v16f16: 4-way interleaving store of <16 x half> inputs read from %src.
; The final mask (0,16,32,48,1,17,33,49,...) takes lane i of each input in turn
; and the <64 x half> result is stored to %dst. The expected code saves d8-d15,
; issues six vldrw.u32 loads, copies q6->q7 and q2->q3, and emits two
; vst40-vst43.16 groups; the first group ends with a writeback of r1.
1013define void @vst4_v16f16(<16 x half> *%src, <64 x half> *%dst) {
1014; CHECK-LABEL: vst4_v16f16:
1015; CHECK:       @ %bb.0: @ %entry
1016; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1017; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1018; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
1019; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
1020; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
1021; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
1022; CHECK-NEXT:    vldrw.u32 q4, [r0]
1023; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
1024; CHECK-NEXT:    vmov q7, q6
1025; CHECK-NEXT:    vmov q3, q2
1026; CHECK-NEXT:    vst40.16 {q4, q5, q6, q7}, [r1]
1027; CHECK-NEXT:    vst41.16 {q4, q5, q6, q7}, [r1]
1028; CHECK-NEXT:    vst42.16 {q4, q5, q6, q7}, [r1]
1029; CHECK-NEXT:    vst43.16 {q4, q5, q6, q7}, [r1]!
1030; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
1031; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
1032; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
1033; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
1034; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1035; CHECK-NEXT:    bx lr
1036entry:
1037  %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
1038  %l1 = load <16 x half>, <16 x half>* %s1, align 4
1039  %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
1040  %l2 = load <16 x half>, <16 x half>* %s2, align 4
1041  %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2
1042  %l3 = load <16 x half>, <16 x half>* %s3, align 4
1043  %s4 = getelementptr <16 x half>, <16 x half>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data (hence the register copies above). Presumably %s4 was intended --
; confirm; if changed, regenerate the assertions with
; utils/update_llc_test_checks.py.
1044  %l4 = load <16 x half>, <16 x half>* %s3, align 4
1045  %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1046  %t2 = shufflevector <16 x half> %l3, <16 x half> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1047  %s = shufflevector <32 x half> %t1, <32 x half> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
1048  store <64 x half> %s, <64 x half> *%dst
1049  ret void
1050}
1051
1052; f64
1053
; vst4_v2f64: 4-way interleaving store of <2 x double> inputs read from %src.
; The final <0,2,4,6,1,3,5,7> mask emits element 0 of every input followed by
; element 1 of every input; the <8 x double> result is stored to %dst. The
; expected code uses three vldrw.u32 loads, vmov.f64 d-register copies and
; four vstrw.32 stores (no vst4x instruction).
1054define void @vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) {
1055; CHECK-LABEL: vst4_v2f64:
1056; CHECK:       @ %bb.0: @ %entry
1057; CHECK-NEXT:    vldrw.u32 q1, [r0]
1058; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
1059; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
1060; CHECK-NEXT:    vmov.f64 d6, d2
1061; CHECK-NEXT:    vmov.f64 d7, d0
1062; CHECK-NEXT:    vmov.f64 d0, d3
1063; CHECK-NEXT:    vstrw.32 q3, [r1]
1064; CHECK-NEXT:    vmov.f64 d2, d4
1065; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
1066; CHECK-NEXT:    vmov.f64 d3, d4
1067; CHECK-NEXT:    vmov.f64 d4, d5
1068; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
1069; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
1070; CHECK-NEXT:    bx lr
1071entry:
1072  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
1073  %l1 = load <2 x double>, <2 x double>* %s1, align 4
1074  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
1075  %l2 = load <2 x double>, <2 x double>* %s2, align 4
1076  %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2
1077  %l3 = load <2 x double>, <2 x double>* %s3, align 4
1078  %s4 = getelementptr <2 x double>, <2 x double>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data. Presumably %s4 was intended -- confirm; if changed, regenerate the
; assertions with utils/update_llc_test_checks.py.
1079  %l4 = load <2 x double>, <2 x double>* %s3, align 4
1080  %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1081  %t2 = shufflevector <2 x double> %l3, <2 x double> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1082  %s = shufflevector <4 x double> %t1, <4 x double> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
1083  store <8 x double> %s, <8 x double> *%dst
1084  ret void
1085}
1086
; vst4_v4f64: 4-way interleaving store of <4 x double> inputs read from %src.
; The final mask (0,4,8,12,1,5,9,13,...) takes lane i of each input in turn and
; the <16 x double> result is stored to %dst. The expected code saves d8-d15,
; issues six vldrw.u32 loads, shuffles with vmov.f64 d-register copies and
; writes the result with eight vstrw.32 stores.
1087define void @vst4_v4f64(<4 x double> *%src, <16 x double> *%dst) {
1088; CHECK-LABEL: vst4_v4f64:
1089; CHECK:       @ %bb.0: @ %entry
1090; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1091; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1092; CHECK-NEXT:    vldrw.u32 q5, [r0]
1093; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
1094; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
1095; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
1096; CHECK-NEXT:    vmov.f64 d4, d10
1097; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
1098; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
1099; CHECK-NEXT:    vmov.f64 d5, d0
1100; CHECK-NEXT:    vmov.f64 d0, d11
1101; CHECK-NEXT:    vstrw.32 q2, [r1]
1102; CHECK-NEXT:    vmov.f64 d10, d12
1103; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
1104; CHECK-NEXT:    vmov.f64 d11, d6
1105; CHECK-NEXT:    vmov.f64 d6, d13
1106; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
1107; CHECK-NEXT:    vmov.f64 d12, d2
1108; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
1109; CHECK-NEXT:    vmov.f64 d14, d8
1110; CHECK-NEXT:    vmov.f64 d13, d2
1111; CHECK-NEXT:    vmov.f64 d15, d8
1112; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
1113; CHECK-NEXT:    vmov.f64 d2, d3
1114; CHECK-NEXT:    vstrw.32 q7, [r1, #80]
1115; CHECK-NEXT:    vmov.f64 d8, d9
1116; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
1117; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
1118; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1119; CHECK-NEXT:    bx lr
1120entry:
1121  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
1122  %l1 = load <4 x double>, <4 x double>* %s1, align 4
1123  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
1124  %l2 = load <4 x double>, <4 x double>* %s2, align 4
1125  %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2
1126  %l3 = load <4 x double>, <4 x double>* %s3, align 4
1127  %s4 = getelementptr <4 x double>, <4 x double>* %src, i32 3
; NOTE(review): %l4 reloads %s3, so %s4 is unused and inputs 3 and 4 are the
; same data. Presumably %s4 was intended -- confirm; if changed, regenerate the
; assertions with utils/update_llc_test_checks.py.
1128  %l4 = load <4 x double>, <4 x double>* %s3, align 4
1129  %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1130  %t2 = shufflevector <4 x double> %l3, <4 x double> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1131  %s = shufflevector <8 x double> %t1, <8 x double> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1132  store <16 x double> %s, <16 x double> *%dst
1133  ret void
1134}
1135