; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; i32

; Interleaving store of three <2 x i32> vectors.
; Loads a, b, c from consecutive <2 x i32> slots at %src and stores
; <a0, b0, c0, a1, b1, c1> to %dst. The assertions below are autogenerated
; llc output; regenerate them with the update script instead of hand-editing.
define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
; CHECK-LABEL: vst3_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
; CHECK-NEXT:    vmov.32 q1[0], r2
; CHECK-NEXT:    vmov.32 q1[1], r3
; CHECK-NEXT:    vmov.32 q0[0], r4
; CHECK-NEXT:    vmov.32 q1[2], r12
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    vmov.32 q1[3], lr
; CHECK-NEXT:    vmov.f32 s8, s7
; CHECK-NEXT:    vmov.f32 s10, s1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s9, s6
; CHECK-NEXT:    vmov.f32 s10, s0
; CHECK-NEXT:    vmov.f32 s11, s5
; CHECK-NEXT:    vstrw.32 q2, [r1]
; CHECK-NEXT:    strd r2, r0, [r1, #16]
; CHECK-NEXT:    pop {r4, pc}
entry:
  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
  %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2
  %l3 = load <2 x i32>, <2 x i32>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type.
  %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i32> %l3, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i32> %s, <6 x i32> *%dst
  ret void
}

; Interleaving store of three <4 x i32> vectors.
; Loads a, b, c from consecutive <4 x i32> slots at %src and stores
; <a0, b0, c0, a1, b1, c1, ...> (12 lanes) to %dst. The assertions below are
; autogenerated llc output; regenerate them instead of hand-editing.
define void @vst3_v4i32(<4 x i32> *%src, <12 x i32> *%dst) {
; CHECK-LABEL: vst3_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vmov.f32 s4, s9
; CHECK-NEXT:    vmov.f64 d6, d8
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.f32 s5, s1
; CHECK-NEXT:    vdup.32 q5, r0
; CHECK-NEXT:    vmov.f32 s13, s8
; CHECK-NEXT:    vmov.f32 s0, s2
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.f32 s7, s10
; CHECK-NEXT:    vdup.32 q2, r0
; CHECK-NEXT:    vmov.f32 s15, s17
; CHECK-NEXT:    vmov.f32 s1, s19
; CHECK-NEXT:    vmov.f32 s6, s18
; CHECK-NEXT:    vmov.f32 s14, s22
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vmov.f32 s2, s10
; CHECK-NEXT:    vstrw.32 q3, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
  %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
  %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type.
  %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i32> %l3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %s, <12 x i32> *%dst
  ret void
}

; Interleaving store of three <8 x i32> vectors.
; Loads a, b, c from consecutive <8 x i32> slots at %src and stores
; <a0, b0, c0, a1, b1, c1, ...> (24 lanes) to %dst. The assertions below are
; autogenerated llc output; regenerate them instead of hand-editing.
define void @vst3_v8i32(<8 x i32> *%src, <24 x i32> *%dst) {
; CHECK-LABEL: vst3_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #48
; CHECK-NEXT:    sub sp, #48
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vmov.f64 d6, d1
; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q6, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s13, s23
; CHECK-NEXT:    vmov r2, s7
; CHECK-NEXT:    vmov.f32 s15, s3
; CHECK-NEXT:    vdup.32 q2, r2
; CHECK-NEXT:    vmov.f32 s14, s10
; CHECK-NEXT:    vmov.f64 d4, d8
; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
; CHECK-NEXT:    vmov.f32 s9, s24
; CHECK-NEXT:    vmov.f32 s11, s17
; CHECK-NEXT:    vmov q4, q5
; CHECK-NEXT:    vmov.f32 s21, s4
; CHECK-NEXT:    vmov r0, s28
; CHECK-NEXT:    vmov.f32 s23, s17
; CHECK-NEXT:    vdup.32 q4, r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.f32 s0, s5
; CHECK-NEXT:    vdup.32 q6, r0
; CHECK-NEXT:    vmov.f32 s10, s18
; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s3, s6
; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s22, s26
; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s2, s6
; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
; CHECK-NEXT:    vmov.f32 s4, s17
; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
; CHECK-NEXT:    vmov.f32 s5, s29
; CHECK-NEXT:    vstrw.32 q2, [r1]
; CHECK-NEXT:    vmov.f32 s28, s30
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.f32 s7, s18
; CHECK-NEXT:    vdup.32 q4, r0
; CHECK-NEXT:    vmov.f32 s29, s27
; CHECK-NEXT:    vmov.f32 s6, s26
; CHECK-NEXT:    vmov.f32 s30, s18
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q7, [r1, #32]
; CHECK-NEXT:    add sp, #48
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
  %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2
  %l3 = load <8 x i32>, <8 x i32>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type.
  %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i32> %l3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i32> %s, <24 x i32> *%dst
  ret void
}

; Interleaving store of three <16 x i32> vectors.
; Loads a, b, c from consecutive <16 x i32> slots at %src and stores
; <a0, b0, c0, a1, b1, c1, ...> (48 lanes) to %dst. The assertions below are
; autogenerated llc output; regenerate them instead of hand-editing.
define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) {
; CHECK-LABEL: vst3_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #160
; CHECK-NEXT:    sub sp, #160
; CHECK-NEXT:    vldrw.u32 q1, [r0, #144]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #128]
; CHECK-NEXT:    vldrw.u32 q5, [r0]
; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
; CHECK-NEXT:    vmov.f32 s8, s1
; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
; CHECK-NEXT:    vstrw.32 q1, [sp, #144] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.f32 s9, s13
; CHECK-NEXT:    vldrw.u32 q6, [r0, #112]
; CHECK-NEXT:    vstrw.32 q1, [sp, #128] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q1, [r0, #160]
; CHECK-NEXT:    vmov.f32 s11, s2
; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
; CHECK-NEXT:    vstrw.32 q1, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
; CHECK-NEXT:    vmov.f32 s10, s22
; CHECK-NEXT:    vstrw.32 q1, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov.f64 d2, d7
; CHECK-NEXT:    vdup.32 q2, r0
; CHECK-NEXT:    vmov.f32 s5, s23
; CHECK-NEXT:    vmov.f32 s7, s15
; CHECK-NEXT:    vmov.f32 s6, s10
; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
; CHECK-NEXT:    vmov.f64 d2, d9
; CHECK-NEXT:    vmov.f32 s5, s31
; CHECK-NEXT:    vmov.f32 s7, s19
; CHECK-NEXT:    vmov r0, s27
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:    vdup.32 q1, r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.f32 s10, s6
; CHECK-NEXT:    vmov.f64 d2, d10
; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s5, s0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov.f32 s7, s21
; CHECK-NEXT:    vmov.f32 s6, s2
; CHECK-NEXT:    vmov.f64 d0, d14
; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s1, s24
; CHECK-NEXT:    vmov.f32 s3, s29
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov.f32 s16, s25
; CHECK-NEXT:    vmov.f32 s6, s2
; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q1, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s19, s26
; CHECK-NEXT:    vldrw.u32 q6, [sp, #144] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s18, s30
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:    vmov.f32 s28, s5
; CHECK-NEXT:    vstrw.32 q4, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s29, s1
; CHECK-NEXT:    vmov.f32 s31, s6
; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov.f64 d8, d1
; CHECK-NEXT:    vmov q5, q1
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s17, s7
; CHECK-NEXT:    vmov.f32 s30, s6
; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
; CHECK-NEXT:    vmov.f64 d6, d5
; CHECK-NEXT:    vstrw.32 q7, [r1, #112]
; CHECK-NEXT:    vmov.f32 s13, s7
; CHECK-NEXT:    vldrw.u32 q1, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s19, s3
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov r0, s27
; CHECK-NEXT:    vmov.f32 s18, s2
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov.f32 s15, s11
; CHECK-NEXT:    vstrw.32 q4, [r1, #128]
; CHECK-NEXT:    vmov.f32 s14, s2
; CHECK-NEXT:    vmov q0, q5
; CHECK-NEXT:    vmov.f32 s21, s4
; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
; CHECK-NEXT:    vmov.f32 s23, s1
; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.f32 s22, s2
; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q5, [r1, #96]
; CHECK-NEXT:    vmov.f64 d2, d0
; CHECK-NEXT:    vmov.f32 s5, s24
; CHECK-NEXT:    vmov q6, q0
; CHECK-NEXT:    vmov.f32 s7, s1
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov.f32 s6, s2
; CHECK-NEXT:    vldrw.u32 q0, [sp, #144] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
; CHECK-NEXT:    vmov.f32 s8, s1
; CHECK-NEXT:    vmov.f32 s11, s2
; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s10, s26
; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #160
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
  %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2
  %l3 = load <16 x i32>, <16 x i32>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type.
  %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x i32> %l3, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x i32> %s, <48 x i32> *%dst
  ret void
}

; i16

; Interleaving store of three <2 x i16> vectors.
; Loads a, b, c from consecutive <2 x i16> slots at %src and stores
; <a0, b0, c0, a1, b1, c1> to %dst. The assertions below are autogenerated
; llc output; regenerate them with the update script instead of hand-editing.
define void @vst3_v2i16(<2 x i16> *%src, <6 x i16> *%dst) {
; CHECK-LABEL: vst3_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldrh r2, [r0, #6]
; CHECK-NEXT:    ldrh r3, [r0, #4]
; CHECK-NEXT:    ldrh.w r12, [r0, #8]
; CHECK-NEXT:    vmov.16 q0[4], r2
; CHECK-NEXT:    ldrh.w lr, [r0, #2]
; CHECK-NEXT:    vmov.32 q1[0], r3
; CHECK-NEXT:    ldrh r4, [r0]
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    ldrh r0, [r0, #10]
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov.32 q0[0], r4
; CHECK-NEXT:    vmov.32 q0[2], lr
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    vdup.32 q1, r12
; CHECK-NEXT:    vmov.f32 s3, s2
; CHECK-NEXT:    vmov.f32 s2, s6
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    str r0, [r1, #8]
; CHECK-NEXT:    pop {r4, pc}
entry:
  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
  %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2
  %l3 = load <2 x i16>, <2 x i16>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type. Masks are <K x i32> although the data
  ; lanes are i16.
  %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i16> %l3, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i16> %s, <6 x i16> *%dst
  ret void
}

; Interleaving store of three <4 x i16> vectors.
; Loads a, b, c from consecutive <4 x i16> slots at %src and stores
; <a0, b0, c0, a1, b1, c1, ...> (12 lanes) to %dst. The assertions below are
; autogenerated llc output; regenerate them instead of hand-editing.
define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) {
; CHECK-LABEL: vst3_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
; CHECK-NEXT:    vldrh.u32 q1, [r0]
; CHECK-NEXT:    vldrh.u32 q3, [r0, #8]
; CHECK-NEXT:    vmov.f64 d0, d5
; CHECK-NEXT:    vmov.f32 s1, s7
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vdup.32 q4, r0
; CHECK-NEXT:    vmov.f32 s3, s11
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.f32 s2, s18
; CHECK-NEXT:    vmov.16 q4[0], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.16 q4[1], r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q4[2], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.16 q4[3], r0
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.16 q4[4], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.16 q4[5], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.16 q4[6], r0
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.16 q4[7], r0
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
  %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2
  %l3 = load <4 x i16>, <4 x i16>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type. Masks are <K x i32> although the data
  ; lanes are i16.
  %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i16> %l3, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i16> %s, <12 x i16> *%dst
  ret void
}

; Interleaving store of three <8 x i16> vectors.
; Loads a, b, c from consecutive <8 x i16> slots at %src and stores
; <a0, b0, c0, a1, b1, c1, ...> (24 lanes) to %dst. The assertions below are
; autogenerated llc output; regenerate them instead of hand-editing.
define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
; CHECK-LABEL: vst3_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
; CHECK-NEXT:    vmov.u16 r2, q1[2]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.u16 r2, q2[3]
; CHECK-NEXT:    vmov.16 q0[1], r2
; CHECK-NEXT:    vmov.u16 r2, q1[4]
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov.u16 r2, q2[5]
; CHECK-NEXT:    vmov.16 q0[7], r2
; CHECK-NEXT:    vmov.u16 r2, q2[0]
; CHECK-NEXT:    vmov.16 q3[0], r2
; CHECK-NEXT:    vmov.u16 r0, q4[0]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u16 r0, q4[1]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u16 r0, q2[2]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u16 r0, q4[2]
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vdup.32 q5, r2
; CHECK-NEXT:    vmov.f32 s13, s8
; CHECK-NEXT:    vmov.u16 r2, q5[2]
; CHECK-NEXT:    vmov.u16 r0, q3[3]
; CHECK-NEXT:    vmov.16 q6[2], r2
; CHECK-NEXT:    vmov r2, s11
; CHECK-NEXT:    vmov.16 q6[3], r0
; CHECK-NEXT:    vmov.u16 r0, q3[4]
; CHECK-NEXT:    vmov.16 q6[4], r0
; CHECK-NEXT:    vmov.u16 r0, q5[5]
; CHECK-NEXT:    vmov.16 q6[5], r0
; CHECK-NEXT:    vmov.u16 r0, q4[5]
; CHECK-NEXT:    vmov.16 q5[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[5]
; CHECK-NEXT:    vmov.16 q5[1], r0
; CHECK-NEXT:    vmov.u16 r0, q4[6]
; CHECK-NEXT:    vmov.16 q5[3], r0
; CHECK-NEXT:    vmov.u16 r0, q4[7]
; CHECK-NEXT:    vmov.f32 s13, s25
; CHECK-NEXT:    vmov.16 q5[6], r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.f32 s14, s26
; CHECK-NEXT:    vmov.16 q5[7], r0
; CHECK-NEXT:    vdup.32 q6, r2
; CHECK-NEXT:    vmov.f32 s1, s5
; CHECK-NEXT:    vmov.u16 r2, q6[2]
; CHECK-NEXT:    vmov.f32 s22, s7
; CHECK-NEXT:    vrev32.16 q4, q4
; CHECK-NEXT:    vmov.16 q7[2], r2
; CHECK-NEXT:    vmov.u16 r0, q5[3]
; CHECK-NEXT:    vmov.u16 r2, q4[2]
; CHECK-NEXT:    vmov.f32 s2, s10
; CHECK-NEXT:    vmov.16 q7[3], r0
; CHECK-NEXT:    vmov.u16 r0, q5[4]
; CHECK-NEXT:    vstrw.32 q3, [r1]
; CHECK-NEXT:    vmov.16 q3[2], r2
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    vmov.16 q7[4], r0
; CHECK-NEXT:    vmov.u16 r0, q6[5]
; CHECK-NEXT:    vmov.16 q3[3], r2
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    vmov.16 q7[5], r0
; CHECK-NEXT:    vmov.u16 r0, q4[5]
; CHECK-NEXT:    vmov.16 q3[4], r2
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.f32 s21, s29
; CHECK-NEXT:    vmov.f32 s1, s13
; CHECK-NEXT:    vmov.f32 s22, s30
; CHECK-NEXT:    vmov.f32 s2, s14
; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
  %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
  %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type. Masks are <K x i32> although the data
  ; lanes are i16.
  %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i16> %l3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i16> %s, <24 x i16> *%dst
  ret void
}

; Interleaving store of three <16 x i16> vectors.
; Loads a, b, c from consecutive <16 x i16> slots at %src and stores
; <a0, b0, c0, a1, b1, c1, ...> (48 lanes) to %dst. The assertions below are
; autogenerated llc output; regenerate them instead of hand-editing.
define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) {
; CHECK-LABEL: vst3_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #160
; CHECK-NEXT:    sub sp, #160
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vmov.u16 r2, q0[0]
; CHECK-NEXT:    vstrw.32 q0, [sp, #128] @ 16-byte Spill
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov.u16 r2, q1[0]
; CHECK-NEXT:    vmov.16 q2[1], r2
; CHECK-NEXT:    vmov.u16 r2, q1[1]
; CHECK-NEXT:    vmov.16 q2[4], r2
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    vmov.16 q2[6], r2
; CHECK-NEXT:    vmov.u16 r2, q1[2]
; CHECK-NEXT:    vmov.16 q2[7], r2
; CHECK-NEXT:    vmov q7, q1
; CHECK-NEXT:    vmov.f32 s9, s0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vmov.u16 r2, q2[3]
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vdup.32 q0, r3
; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vmov.u16 r3, q0[2]
; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vmov.16 q2[2], r3
; CHECK-NEXT:    vmov.16 q2[3], r2
; CHECK-NEXT:    vmov.u16 r2, q3[4]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vmov.16 q2[4], r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vmov.16 q2[5], r2
; CHECK-NEXT:    vmov.u16 r2, q3[5]
; CHECK-NEXT:    vmov.16 q5[0], r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    vmov.16 q5[1], r2
; CHECK-NEXT:    vmov.u16 r2, q3[6]
; CHECK-NEXT:    vmov.16 q5[3], r2
; CHECK-NEXT:    vmov.u16 r2, q3[7]
; CHECK-NEXT:    vmov.16 q5[6], r2
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov.16 q5[7], r2
; CHECK-NEXT:    vstrw.32 q0, [sp, #144] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s22, s3
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q2, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vmov.u16 r2, q5[3]
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vmov.u16 r0, q0[2]
; CHECK-NEXT:    vmov.16 q4[2], r0
; CHECK-NEXT:    vmov.u16 r0, q5[4]
; CHECK-NEXT:    vmov.16 q4[3], r2
; CHECK-NEXT:    vmov.16 q4[4], r0
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    vmov.16 q4[5], r0
; CHECK-NEXT:    vmov.u16 r0, q2[0]
; CHECK-NEXT:    vmov.16 q6[0], r0
; CHECK-NEXT:    vmov.u16 r0, q3[0]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #144] @ 16-byte Reload
; CHECK-NEXT:    vmov.16 q6[1], r0
; CHECK-NEXT:    vmov.u16 r0, q3[1]
; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov.16 q6[4], r0
; CHECK-NEXT:    vmov.u16 r0, q2[2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov.16 q6[6], r0
; CHECK-NEXT:    vmov.u16 r0, q3[2]
; CHECK-NEXT:    vdup.32 q0, r2
; CHECK-NEXT:    vmov.16 q6[7], r0
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    vmov.f32 s25, s8
; CHECK-NEXT:    vmov.16 q2[2], r2
; CHECK-NEXT:    vmov.u16 r0, q6[3]
; CHECK-NEXT:    vmov q4, q1
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov.u16 r0, q6[4]
; CHECK-NEXT:    vmov.16 q2[4], r0
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    vmov.16 q2[5], r0
; CHECK-NEXT:    vmov.u16 r0, q7[5]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[5]
; CHECK-NEXT:    vmov.16 q0[1], r0
; CHECK-NEXT:    vmov.u16 r0, q7[6]
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.u16 r0, q7[7]
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
; CHECK-NEXT:    vmov.16 q0[7], r0
; CHECK-NEXT:    vmov.f32 s2, s19
; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vmov r2, s7
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vdup.32 q7, r2
; CHECK-NEXT:    vrev32.16 q3, q3
; CHECK-NEXT:    vmov.u16 r2, q7[2]
; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov.16 q2[2], r2
; CHECK-NEXT:    vstrw.32 q4, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov.u16 r0, q0[4]
; CHECK-NEXT:    vmov.16 q2[4], r0
; CHECK-NEXT:    vmov.u16 r0, q7[5]
; CHECK-NEXT:    vmov.16 q2[5], r0
; CHECK-NEXT:    vmov.u16 r0, q4[2]
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u16 r0, q4[4]
; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u16 r0, q1[5]
; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    vrev32.16 q7, q7
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vmov.16 q7[0], r0
; CHECK-NEXT:    vmov.u16 r0, q4[3]
; CHECK-NEXT:    vmov.f32 s1, s9
; CHECK-NEXT:    vmov.16 q7[1], r0
; CHECK-NEXT:    vmov.u16 r0, q1[4]
; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s2, s10
; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vmov.16 q7[6], r0
; CHECK-NEXT:    vmov.f32 s25, s5
; CHECK-NEXT:    vmov.u16 r0, q4[5]
; CHECK-NEXT:    vmov.f32 s26, s6
; CHECK-NEXT:    vldrw.u32 q1, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s21, s5
; CHECK-NEXT:    vmov.16 q7[7], r0
; CHECK-NEXT:    vmov.f32 s22, s6
; CHECK-NEXT:    vldrw.u32 q1, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    vmov.f32 s9, s5
; CHECK-NEXT:    vmov.16 q0[2], r2
; CHECK-NEXT:    vmov.f32 s10, s6
; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
; CHECK-NEXT:    vmov.f32 s29, s5
; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s30, s18
; CHECK-NEXT:    vstrw.32 q6, [r1]
; CHECK-NEXT:    vmov.u16 r2, q7[3]
; CHECK-NEXT:    vmov.f32 s13, s5
; CHECK-NEXT:    vmov.16 q0[3], r2
; CHECK-NEXT:    vmov.u16 r2, q7[4]
; CHECK-NEXT:    vmov.16 q0[4], r2
; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov.f32 s29, s1
; CHECK-NEXT:    vmov.f32 s30, s2
; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s14, s6
; CHECK-NEXT:    vstrw.32 q7, [r1, #16]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    vmov.16 q0[2], r2
; CHECK-NEXT:    vmov.u16 r2, q3[3]
; CHECK-NEXT:    vmov.16 q0[3], r2
; CHECK-NEXT:    vmov.u16 r2, q3[4]
; CHECK-NEXT:    vmov.16 q0[4], r2
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov.f32 s13, s1
; CHECK-NEXT:    vmov.f32 s14, s2
; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
; CHECK-NEXT:    add sp, #160
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
  %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2
  %l3 = load <16 x i16>, <16 x i16>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type. Masks are <K x i32> although the data
  ; lanes are i16.
  %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x i16> %l3, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x i16> %s, <48 x i16> *%dst
  ret void
}

; i8

; Interleaving store of three <2 x i8> vectors.
; Loads a, b, c from consecutive <2 x i8> slots at %src and stores
; <a0, b0, c0, a1, b1, c1> to %dst. The assertions below are autogenerated
; llc output; regenerate them with the update script instead of hand-editing.
define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) {
; CHECK-LABEL: vst3_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    ldrb r2, [r0]
; CHECK-NEXT:    movs r6, #0
; CHECK-NEXT:    ldrb r3, [r0, #1]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrb.w r12, [r0, #2]
; CHECK-NEXT:    vmov.32 q0[2], r3
; CHECK-NEXT:    ldrb.w lr, [r0, #3]
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    ldrb r5, [r0, #5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb r0, [r0, #4]
; CHECK-NEXT:    vmov.16 q0[1], r12
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    add r0, sp, #8
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], lr
; CHECK-NEXT:    vmov.16 q0[5], r5
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], r6
; CHECK-NEXT:    vstrb.16 q0, [r2]
; CHECK-NEXT:    vstrb.16 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    ldr r2, [sp]
; CHECK-NEXT:    str r2, [r1]
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    strh r0, [r1, #4]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
  %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2
  %l3 = load <2 x i8>, <2 x i8>* %s3, align 4
  ; %t1 = a ++ b; %t2 = c padded with undef lanes so both operands of the
  ; final shuffle have the same type. Masks are <K x i32> although the data
  ; lanes are i8.
  %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i8> %l3, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; The mask picks lane i of a, b and c in turn.
  %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i8> %s, <6 x i8> *%dst
  ret void
}

; vst3_v4i8: 3-way interleaving store of <4 x i8>.
; Loads three <4 x i8> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1 ...) and stores the resulting
; <12 x i8> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR.
732define void @vst3_v4i8(<4 x i8> *%src, <12 x i8> *%dst) {
733; CHECK-LABEL: vst3_v4i8:
734; CHECK:       @ %bb.0: @ %entry
735; CHECK-NEXT:    .vsave {d8, d9}
736; CHECK-NEXT:    vpush {d8, d9}
737; CHECK-NEXT:    vldrb.u32 q1, [r0]
738; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
739; CHECK-NEXT:    vldrb.u32 q3, [r0, #8]
740; CHECK-NEXT:    vmov r2, s4
741; CHECK-NEXT:    vmov.16 q0[0], r2
742; CHECK-NEXT:    vmov r2, s8
743; CHECK-NEXT:    vmov.16 q0[1], r2
744; CHECK-NEXT:    vmov r0, s12
745; CHECK-NEXT:    vmov.16 q0[2], r0
746; CHECK-NEXT:    vmov r0, s5
747; CHECK-NEXT:    vmov.16 q0[3], r0
748; CHECK-NEXT:    vmov r0, s9
749; CHECK-NEXT:    vmov.16 q0[4], r0
750; CHECK-NEXT:    vmov r0, s13
751; CHECK-NEXT:    vmov.16 q0[5], r0
752; CHECK-NEXT:    vmov r0, s6
753; CHECK-NEXT:    vmov.16 q0[6], r0
754; CHECK-NEXT:    vmov r0, s10
755; CHECK-NEXT:    vmov.16 q0[7], r0
756; CHECK-NEXT:    vmov r0, s14
757; CHECK-NEXT:    vmov.8 q4[8], r0
758; CHECK-NEXT:    vmov r0, s7
759; CHECK-NEXT:    vmov.8 q4[9], r0
760; CHECK-NEXT:    vmov r0, s11
761; CHECK-NEXT:    vmov.8 q4[10], r0
762; CHECK-NEXT:    vmov r0, s15
763; CHECK-NEXT:    vmov.8 q4[11], r0
764; CHECK-NEXT:    vstrb.16 q0, [r1]
765; CHECK-NEXT:    vmov r0, s18
766; CHECK-NEXT:    str r0, [r1, #8]
767; CHECK-NEXT:    vpop {d8, d9}
768; CHECK-NEXT:    bx lr
769entry:
770  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
771  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
772  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
773  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
774  %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2
775  %l3 = load <4 x i8>, <4 x i8>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
776  %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
777  %t2 = shufflevector <4 x i8> %l3, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
778  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
779  store <12 x i8> %s, <12 x i8> *%dst
780  ret void
781}
782
; vst3_v8i8: 3-way interleaving store of <8 x i8>.
; Loads three <8 x i8> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1 ...) and stores the resulting
; <24 x i8> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR.
783define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
784; CHECK-LABEL: vst3_v8i8:
785; CHECK:       @ %bb.0: @ %entry
786; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
787; CHECK-NEXT:    vpush {d8, d9, d10, d11}
788; CHECK-NEXT:    vldrb.u16 q1, [r0, #8]
789; CHECK-NEXT:    vldrb.u16 q2, [r0, #16]
790; CHECK-NEXT:    vldrb.u16 q3, [r0]
791; CHECK-NEXT:    vmov.u16 r2, q1[5]
792; CHECK-NEXT:    vmov.16 q0[0], r2
793; CHECK-NEXT:    vmov.u16 r2, q2[5]
794; CHECK-NEXT:    vmov.16 q0[1], r2
795; CHECK-NEXT:    vmov.u16 r2, q1[6]
796; CHECK-NEXT:    vmov.16 q0[3], r2
797; CHECK-NEXT:    vmov.u16 r2, q1[7]
798; CHECK-NEXT:    vmov.16 q0[6], r2
799; CHECK-NEXT:    vmov.u16 r2, q2[7]
800; CHECK-NEXT:    vmov r0, s15
801; CHECK-NEXT:    vmov.16 q0[7], r2
802; CHECK-NEXT:    vdup.32 q4, r0
803; CHECK-NEXT:    vmov.f32 s2, s11
804; CHECK-NEXT:    vmov.u16 r0, q4[2]
805; CHECK-NEXT:    vmov.u16 r2, q0[3]
806; CHECK-NEXT:    vmov.16 q5[2], r0
807; CHECK-NEXT:    vmov.u16 r0, q0[4]
808; CHECK-NEXT:    vmov.16 q5[3], r2
809; CHECK-NEXT:    vmov.16 q5[4], r0
810; CHECK-NEXT:    vmov.u16 r0, q4[5]
811; CHECK-NEXT:    vmov.16 q5[5], r0
812; CHECK-NEXT:    vmov.u16 r0, q3[0]
813; CHECK-NEXT:    vmov.8 q4[0], r0
814; CHECK-NEXT:    vmov.u16 r0, q1[0]
815; CHECK-NEXT:    vmov.8 q4[1], r0
816; CHECK-NEXT:    vmov.u16 r0, q2[0]
817; CHECK-NEXT:    vmov.8 q4[2], r0
818; CHECK-NEXT:    vmov.u16 r0, q3[1]
819; CHECK-NEXT:    vmov.8 q4[3], r0
820; CHECK-NEXT:    vmov.u16 r0, q1[1]
821; CHECK-NEXT:    vmov.8 q4[4], r0
822; CHECK-NEXT:    vmov.u16 r0, q2[1]
823; CHECK-NEXT:    vmov.8 q4[5], r0
824; CHECK-NEXT:    vmov.u16 r0, q3[2]
825; CHECK-NEXT:    vmov.8 q4[6], r0
826; CHECK-NEXT:    vmov.u16 r0, q1[2]
827; CHECK-NEXT:    vmov.8 q4[7], r0
828; CHECK-NEXT:    vmov.u16 r0, q2[2]
829; CHECK-NEXT:    vmov.8 q4[8], r0
830; CHECK-NEXT:    vmov.u16 r0, q3[3]
831; CHECK-NEXT:    vmov.8 q4[9], r0
832; CHECK-NEXT:    vmov.u16 r0, q1[3]
833; CHECK-NEXT:    vmov.8 q4[10], r0
834; CHECK-NEXT:    vmov.u16 r0, q2[3]
835; CHECK-NEXT:    vmov.8 q4[11], r0
836; CHECK-NEXT:    vmov.u16 r0, q3[4]
837; CHECK-NEXT:    vmov.8 q4[12], r0
838; CHECK-NEXT:    vmov.u16 r0, q1[4]
839; CHECK-NEXT:    vmov.8 q4[13], r0
840; CHECK-NEXT:    vmov.u16 r0, q2[4]
841; CHECK-NEXT:    vmov.f32 s1, s21
842; CHECK-NEXT:    vmov.8 q4[14], r0
843; CHECK-NEXT:    vmov.u16 r0, q3[5]
844; CHECK-NEXT:    vmov.f32 s2, s22
845; CHECK-NEXT:    vmov.8 q4[15], r0
846; CHECK-NEXT:    vstrb.16 q0, [r1, #16]
847; CHECK-NEXT:    vstrw.32 q4, [r1]
848; CHECK-NEXT:    vpop {d8, d9, d10, d11}
849; CHECK-NEXT:    bx lr
850entry:
851  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
852  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
853  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
854  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
855  %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2
856  %l3 = load <8 x i8>, <8 x i8>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
857  %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
858  %t2 = shufflevector <8 x i8> %l3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
859  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
860  store <24 x i8> %s, <24 x i8> *%dst
861  ret void
862}
863
; vst3_v16i8: 3-way interleaving store of <16 x i8>.
; Loads three <16 x i8> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1 ...) and stores the resulting
; <48 x i8> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR.
864define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) {
865; CHECK-LABEL: vst3_v16i8:
866; CHECK:       @ %bb.0: @ %entry
867; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
868; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
869; CHECK-NEXT:    vldrw.u32 q3, [r0]
870; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
871; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
872; CHECK-NEXT:    vmov.u8 r3, q3[0]
873; CHECK-NEXT:    vmov.u8 r0, q2[0]
874; CHECK-NEXT:    vmov.8 q5[0], r3
875; CHECK-NEXT:    vmov.u8 r2, q1[0]
876; CHECK-NEXT:    vmov.8 q5[1], r0
877; CHECK-NEXT:    vmov.u8 r0, q3[1]
878; CHECK-NEXT:    vmov.8 q5[3], r0
879; CHECK-NEXT:    vmov.u8 r0, q2[1]
880; CHECK-NEXT:    vmov.8 q5[4], r0
881; CHECK-NEXT:    vmov.u8 r0, q3[2]
882; CHECK-NEXT:    vmov.8 q5[6], r0
883; CHECK-NEXT:    vmov.u8 r0, q2[2]
884; CHECK-NEXT:    vmov.8 q5[7], r0
885; CHECK-NEXT:    vmov.u8 r0, q3[3]
886; CHECK-NEXT:    vmov.8 q5[9], r0
887; CHECK-NEXT:    vmov.u8 r0, q2[3]
888; CHECK-NEXT:    vmov.8 q5[10], r0
889; CHECK-NEXT:    vmov.u8 r0, q3[4]
890; CHECK-NEXT:    vmov.8 q4[2], r2
891; CHECK-NEXT:    vmov.u8 r2, q1[2]
892; CHECK-NEXT:    vmov.8 q5[12], r0
893; CHECK-NEXT:    vmov.u8 r0, q2[4]
894; CHECK-NEXT:    vmov.8 q4[8], r2
895; CHECK-NEXT:    vmov.u8 r2, q1[3]
896; CHECK-NEXT:    vmov.8 q5[13], r0
897; CHECK-NEXT:    vmov.u8 r0, q3[5]
898; CHECK-NEXT:    vmov.8 q5[15], r0
899; CHECK-NEXT:    vmov.8 q4[11], r2
900; CHECK-NEXT:    vmov.u8 r2, q1[4]
901; CHECK-NEXT:    vmov.u8 r0, q5[0]
902; CHECK-NEXT:    vmov.8 q4[14], r2
903; CHECK-NEXT:    vmov.8 q0[0], r0
904; CHECK-NEXT:    vmov.f32 s17, s4
905; CHECK-NEXT:    vmov.u8 r0, q5[1]
906; CHECK-NEXT:    vmov.8 q0[1], r0
907; CHECK-NEXT:    vmov.u8 r2, q4[2]
908; CHECK-NEXT:    vmov.8 q0[2], r2
909; CHECK-NEXT:    vmov.u8 r0, q5[3]
910; CHECK-NEXT:    vmov.8 q0[3], r0
911; CHECK-NEXT:    vmov.u8 r0, q5[4]
912; CHECK-NEXT:    vmov.8 q0[4], r0
913; CHECK-NEXT:    vmov.u8 r0, q4[5]
914; CHECK-NEXT:    vmov.8 q0[5], r0
915; CHECK-NEXT:    vmov.u8 r0, q5[6]
916; CHECK-NEXT:    vmov.8 q0[6], r0
917; CHECK-NEXT:    vmov.u8 r0, q5[7]
918; CHECK-NEXT:    vmov.8 q0[7], r0
919; CHECK-NEXT:    vmov.u8 r0, q4[8]
920; CHECK-NEXT:    vmov.8 q0[8], r0
921; CHECK-NEXT:    vmov.u8 r0, q5[9]
922; CHECK-NEXT:    vmov.8 q0[9], r0
923; CHECK-NEXT:    vmov.u8 r0, q5[10]
924; CHECK-NEXT:    vmov.8 q0[10], r0
925; CHECK-NEXT:    vmov.u8 r0, q4[11]
926; CHECK-NEXT:    vmov.8 q0[11], r0
927; CHECK-NEXT:    vmov.u8 r0, q5[12]
928; CHECK-NEXT:    vmov.8 q0[12], r0
929; CHECK-NEXT:    vmov.u8 r0, q5[13]
930; CHECK-NEXT:    vmov.8 q0[13], r0
931; CHECK-NEXT:    vmov.u8 r0, q4[14]
932; CHECK-NEXT:    vmov.8 q0[14], r0
933; CHECK-NEXT:    vmov.u8 r0, q5[15]
934; CHECK-NEXT:    vmov.8 q0[15], r0
935; CHECK-NEXT:    vmov.u8 r0, q2[5]
936; CHECK-NEXT:    vmov.8 q5[0], r0
937; CHECK-NEXT:    vmov.u8 r0, q1[5]
938; CHECK-NEXT:    vmov.8 q5[1], r0
939; CHECK-NEXT:    vmov.u8 r0, q2[6]
940; CHECK-NEXT:    vmov.8 q5[3], r0
941; CHECK-NEXT:    vmov.u8 r0, q1[6]
942; CHECK-NEXT:    vmov.8 q5[4], r0
943; CHECK-NEXT:    vmov.u8 r0, q2[7]
944; CHECK-NEXT:    vmov.8 q5[6], r0
945; CHECK-NEXT:    vmov.u8 r0, q1[7]
946; CHECK-NEXT:    vmov.8 q5[7], r0
947; CHECK-NEXT:    vmov.u8 r0, q2[8]
948; CHECK-NEXT:    vmov.8 q5[9], r0
949; CHECK-NEXT:    vmov.u8 r0, q1[8]
950; CHECK-NEXT:    vmov.8 q5[10], r0
951; CHECK-NEXT:    vmov.u8 r0, q2[9]
952; CHECK-NEXT:    vmov.8 q5[12], r0
953; CHECK-NEXT:    vmov.u8 r0, q1[9]
954; CHECK-NEXT:    vmov.8 q5[13], r0
955; CHECK-NEXT:    vmov.u8 r0, q2[10]
956; CHECK-NEXT:    vmov.8 q5[15], r0
957; CHECK-NEXT:    vstrw.32 q0, [r1]
958; CHECK-NEXT:    vmov.u8 r0, q5[0]
959; CHECK-NEXT:    vmov.8 q4[0], r0
960; CHECK-NEXT:    vmov.u8 r0, q5[1]
961; CHECK-NEXT:    vmov.8 q4[1], r0
962; CHECK-NEXT:    vmov.u8 r0, q3[7]
963; CHECK-NEXT:    vmov.8 q6[5], r0
964; CHECK-NEXT:    vmov.u8 r0, q3[8]
965; CHECK-NEXT:    vmov.8 q6[8], r0
966; CHECK-NEXT:    vmov.u8 r0, q3[9]
967; CHECK-NEXT:    vmov.8 q6[11], r0
968; CHECK-NEXT:    vmov.f32 s24, s13
969; CHECK-NEXT:    vmov.f32 s27, s14
970; CHECK-NEXT:    vmov.u8 r0, q6[2]
971; CHECK-NEXT:    vmov.8 q4[2], r0
972; CHECK-NEXT:    vmov.u8 r0, q5[3]
973; CHECK-NEXT:    vmov.8 q4[3], r0
974; CHECK-NEXT:    vmov.u8 r0, q5[4]
975; CHECK-NEXT:    vmov.8 q4[4], r0
976; CHECK-NEXT:    vmov.u8 r0, q6[5]
977; CHECK-NEXT:    vmov.8 q4[5], r0
978; CHECK-NEXT:    vmov.u8 r0, q5[6]
979; CHECK-NEXT:    vmov.8 q4[6], r0
980; CHECK-NEXT:    vmov.u8 r0, q5[7]
981; CHECK-NEXT:    vmov.8 q4[7], r0
982; CHECK-NEXT:    vmov.u8 r0, q6[8]
983; CHECK-NEXT:    vmov.8 q4[8], r0
984; CHECK-NEXT:    vmov.u8 r0, q5[9]
985; CHECK-NEXT:    vmov.8 q4[9], r0
986; CHECK-NEXT:    vmov.u8 r0, q5[10]
987; CHECK-NEXT:    vmov.8 q4[10], r0
988; CHECK-NEXT:    vmov.u8 r0, q6[11]
989; CHECK-NEXT:    vmov.8 q4[11], r0
990; CHECK-NEXT:    vmov.u8 r0, q5[12]
991; CHECK-NEXT:    vmov.8 q4[12], r0
992; CHECK-NEXT:    vmov.u8 r0, q5[13]
993; CHECK-NEXT:    vmov.8 q4[13], r0
994; CHECK-NEXT:    vmov.u8 r0, q6[14]
995; CHECK-NEXT:    vmov.8 q4[14], r0
996; CHECK-NEXT:    vmov.u8 r0, q5[15]
997; CHECK-NEXT:    vmov.8 q4[15], r0
998; CHECK-NEXT:    vmov.u8 r0, q1[10]
999; CHECK-NEXT:    vmov.8 q5[0], r0
1000; CHECK-NEXT:    vmov.u8 r0, q3[11]
1001; CHECK-NEXT:    vmov.8 q5[1], r0
1002; CHECK-NEXT:    vmov.u8 r0, q1[11]
1003; CHECK-NEXT:    vmov.8 q5[3], r0
1004; CHECK-NEXT:    vmov.u8 r0, q3[12]
1005; CHECK-NEXT:    vmov.8 q5[4], r0
1006; CHECK-NEXT:    vmov.u8 r0, q1[12]
1007; CHECK-NEXT:    vmov.8 q5[6], r0
1008; CHECK-NEXT:    vmov.u8 r0, q3[13]
1009; CHECK-NEXT:    vmov.8 q5[7], r0
1010; CHECK-NEXT:    vmov.u8 r0, q1[13]
1011; CHECK-NEXT:    vmov.8 q5[9], r0
1012; CHECK-NEXT:    vmov.u8 r0, q3[14]
1013; CHECK-NEXT:    vmov.8 q5[10], r0
1014; CHECK-NEXT:    vmov.u8 r0, q1[14]
1015; CHECK-NEXT:    vmov.8 q5[12], r0
1016; CHECK-NEXT:    vmov.u8 r0, q3[15]
1017; CHECK-NEXT:    vmov.8 q5[13], r0
1018; CHECK-NEXT:    vmov.u8 r0, q1[15]
1019; CHECK-NEXT:    vmov.8 q5[15], r0
1020; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
1021; CHECK-NEXT:    vmov.u8 r0, q5[0]
1022; CHECK-NEXT:    vmov.8 q1[0], r0
1023; CHECK-NEXT:    vmov.u8 r0, q5[1]
1024; CHECK-NEXT:    vmov.8 q1[1], r0
1025; CHECK-NEXT:    vmov.u8 r0, q2[11]
1026; CHECK-NEXT:    vmov.8 q3[2], r0
1027; CHECK-NEXT:    vmov.u8 r0, q2[12]
1028; CHECK-NEXT:    vmov.8 q3[5], r0
1029; CHECK-NEXT:    vmov.u8 r0, q2[13]
1030; CHECK-NEXT:    vmov.8 q3[8], r0
1031; CHECK-NEXT:    vmov.u8 r0, q2[14]
1032; CHECK-NEXT:    vmov.8 q3[11], r0
1033; CHECK-NEXT:    vmov.u8 r0, q2[15]
1034; CHECK-NEXT:    vmov.8 q3[14], r0
1035; CHECK-NEXT:    vmov.u8 r0, q3[2]
1036; CHECK-NEXT:    vmov.8 q1[2], r0
1037; CHECK-NEXT:    vmov.u8 r0, q5[3]
1038; CHECK-NEXT:    vmov.8 q1[3], r0
1039; CHECK-NEXT:    vmov.u8 r0, q5[4]
1040; CHECK-NEXT:    vmov.8 q1[4], r0
1041; CHECK-NEXT:    vmov.u8 r0, q3[5]
1042; CHECK-NEXT:    vmov.8 q1[5], r0
1043; CHECK-NEXT:    vmov.u8 r0, q5[6]
1044; CHECK-NEXT:    vmov.8 q1[6], r0
1045; CHECK-NEXT:    vmov.u8 r0, q5[7]
1046; CHECK-NEXT:    vmov.8 q1[7], r0
1047; CHECK-NEXT:    vmov.u8 r0, q3[8]
1048; CHECK-NEXT:    vmov.8 q1[8], r0
1049; CHECK-NEXT:    vmov.u8 r0, q5[9]
1050; CHECK-NEXT:    vmov.8 q1[9], r0
1051; CHECK-NEXT:    vmov.u8 r0, q5[10]
1052; CHECK-NEXT:    vmov.8 q1[10], r0
1053; CHECK-NEXT:    vmov.u8 r0, q3[11]
1054; CHECK-NEXT:    vmov.8 q1[11], r0
1055; CHECK-NEXT:    vmov.u8 r0, q5[12]
1056; CHECK-NEXT:    vmov.8 q1[12], r0
1057; CHECK-NEXT:    vmov.u8 r0, q5[13]
1058; CHECK-NEXT:    vmov.8 q1[13], r0
1059; CHECK-NEXT:    vmov.u8 r0, q3[14]
1060; CHECK-NEXT:    vmov.8 q1[14], r0
1061; CHECK-NEXT:    vmov.u8 r0, q5[15]
1062; CHECK-NEXT:    vmov.8 q1[15], r0
1063; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
1064; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
1065; CHECK-NEXT:    bx lr
1066entry:
1067  %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
1068  %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
1069  %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
1070  %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
1071  %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2
1072  %l3 = load <16 x i8>, <16 x i8>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
1073  %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1074  %t2 = shufflevector <16 x i8> %l3, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1075  %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1076  store <48 x i8> %s, <48 x i8> *%dst
1077  ret void
1078}
1079
1080; i64
1081
; vst3_v2i64: 3-way interleaving store of <2 x i64>.
; Loads three <2 x i64> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1) and stores the resulting
; <6 x i64> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR.
1082define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
1083; CHECK-LABEL: vst3_v2i64:
1084; CHECK:       @ %bb.0: @ %entry
1085; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
1086; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
1087; CHECK-NEXT:    vldrw.u32 q1, [r0]
1088; CHECK-NEXT:    vmov.f64 d6, d5
1089; CHECK-NEXT:    vmov.f32 s13, s11
1090; CHECK-NEXT:    vmov.f32 s14, s2
1091; CHECK-NEXT:    vmov.f32 s15, s3
1092; CHECK-NEXT:    vmov.f32 s2, s6
1093; CHECK-NEXT:    vmov.f32 s3, s7
1094; CHECK-NEXT:    vmov.f32 s6, s8
1095; CHECK-NEXT:    vmov.f32 s7, s9
1096; CHECK-NEXT:    vstrb.8 q1, [r1], #32
1097; CHECK-NEXT:    vstrw.32 q3, [r1]
1098; CHECK-NEXT:    vstrw.32 q0, [r1, #-16]
1099; CHECK-NEXT:    bx lr
1100entry:
1101  %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
1102  %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
1103  %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
1104  %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
1105  %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2
1106  %l3 = load <2 x i64>, <2 x i64>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
  ; (Shuffle masks are i32 lane indices even though the elements are i64.)
1107  %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1108  %t2 = shufflevector <2 x i64> %l3, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1109  %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1110  store <6 x i64> %s, <6 x i64> *%dst
1111  ret void
1112}
1113
; vst3_v4i64: 3-way interleaving store of <4 x i64>.
; Loads three <4 x i64> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1 ...) and stores the resulting
; <12 x i64> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR.
1114define void @vst3_v4i64(<4 x i64> *%src, <12 x i64> *%dst) {
1115; CHECK-LABEL: vst3_v4i64:
1116; CHECK:       @ %bb.0: @ %entry
1117; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1118; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1119; CHECK-NEXT:    .pad #16
1120; CHECK-NEXT:    sub sp, #16
1121; CHECK-NEXT:    vldrw.u32 q1, [r0]
1122; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
1123; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
1124; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
1125; CHECK-NEXT:    vmov.f64 d10, d2
1126; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
1127; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
1128; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
1129; CHECK-NEXT:    vmov.f32 s21, s5
1130; CHECK-NEXT:    vmov.f32 s22, s28
1131; CHECK-NEXT:    vmov.f32 s23, s29
1132; CHECK-NEXT:    vmov.f64 d14, d12
1133; CHECK-NEXT:    vstrw.32 q5, [r1]
1134; CHECK-NEXT:    vmov.f32 s29, s25
1135; CHECK-NEXT:    vmov.f64 d8, d7
1136; CHECK-NEXT:    vmov.f32 s30, s12
1137; CHECK-NEXT:    vmov.f32 s17, s15
1138; CHECK-NEXT:    vmov.f32 s31, s13
1139; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
1140; CHECK-NEXT:    vmov.f32 s18, s2
1141; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
1142; CHECK-NEXT:    vmov.f32 s4, s8
1143; CHECK-NEXT:    vmov.f32 s19, s3
1144; CHECK-NEXT:    vmov.f32 s2, s26
1145; CHECK-NEXT:    vstrw.32 q4, [r1, #80]
1146; CHECK-NEXT:    vmov.f32 s5, s9
1147; CHECK-NEXT:    vmov.f32 s8, s14
1148; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
1149; CHECK-NEXT:    vmov.f32 s3, s27
1150; CHECK-NEXT:    vmov.f32 s9, s15
1151; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
1152; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
1153; CHECK-NEXT:    add sp, #16
1154; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1155; CHECK-NEXT:    bx lr
1156entry:
1157  %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
1158  %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
1159  %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
1160  %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
1161  %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2
1162  %l3 = load <4 x i64>, <4 x i64>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
  ; (Shuffle masks are i32 lane indices even though the elements are i64.)
1163  %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1164  %t2 = shufflevector <4 x i64> %l3, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1165  %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1166  store <12 x i64> %s, <12 x i64> *%dst
1167  ret void
1168}
1169
1170; f32
1171
; vst3_v2f32: 3-way interleaving store of <2 x float>.
; Loads three <2 x float> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1) and stores the resulting
; <6 x float> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR.
1172define void @vst3_v2f32(<2 x float> *%src, <6 x float> *%dst) {
1173; CHECK-LABEL: vst3_v2f32:
1174; CHECK:       @ %bb.0: @ %entry
1175; CHECK-NEXT:    vldr s0, [r0]
1176; CHECK-NEXT:    vldr s3, [r0, #4]
1177; CHECK-NEXT:    vldr s1, [r0, #8]
1178; CHECK-NEXT:    ldr r2, [r0, #20]
1179; CHECK-NEXT:    vldr s2, [r0, #16]
1180; CHECK-NEXT:    ldr r0, [r0, #12]
1181; CHECK-NEXT:    strd r0, r2, [r1, #16]
1182; CHECK-NEXT:    vstrw.32 q0, [r1]
1183; CHECK-NEXT:    bx lr
1184entry:
1185  %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
1186  %l1 = load <2 x float>, <2 x float>* %s1, align 4
1187  %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
1188  %l2 = load <2 x float>, <2 x float>* %s2, align 4
1189  %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2
1190  %l3 = load <2 x float>, <2 x float>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
1191  %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1192  %t2 = shufflevector <2 x float> %l3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1193  %s = shufflevector <4 x float> %t1, <4 x float> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1194  store <6 x float> %s, <6 x float> *%dst
1195  ret void
1196}
1197
; vst3_v4f32: 3-way interleaving store of <4 x float>.
; Loads three <4 x float> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1 ...) and stores the resulting
; <12 x float> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR.
1198define void @vst3_v4f32(<4 x float> *%src, <12 x float> *%dst) {
1199; CHECK-LABEL: vst3_v4f32:
1200; CHECK:       @ %bb.0: @ %entry
1201; CHECK-NEXT:    .vsave {d8, d9}
1202; CHECK-NEXT:    vpush {d8, d9}
1203; CHECK-NEXT:    vldrw.u32 q3, [r0]
1204; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
1205; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
1206; CHECK-NEXT:    vmov.f64 d8, d6
1207; CHECK-NEXT:    vmov.f32 s17, s4
1208; CHECK-NEXT:    vmov.f32 s8, s5
1209; CHECK-NEXT:    vmov.f32 s19, s13
1210; CHECK-NEXT:    vmov.f32 s9, s1
1211; CHECK-NEXT:    vmov.f32 s18, s0
1212; CHECK-NEXT:    vmov.f32 s0, s2
1213; CHECK-NEXT:    vstrw.32 q4, [r1]
1214; CHECK-NEXT:    vmov.f32 s11, s6
1215; CHECK-NEXT:    vmov.f32 s1, s15
1216; CHECK-NEXT:    vmov.f32 s10, s14
1217; CHECK-NEXT:    vmov.f32 s2, s7
1218; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
1219; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
1220; CHECK-NEXT:    vpop {d8, d9}
1221; CHECK-NEXT:    bx lr
1222entry:
1223  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
1224  %l1 = load <4 x float>, <4 x float>* %s1, align 4
1225  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
1226  %l2 = load <4 x float>, <4 x float>* %s2, align 4
1227  %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
1228  %l3 = load <4 x float>, <4 x float>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
1229  %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1230  %t2 = shufflevector <4 x float> %l3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1231  %s = shufflevector <8 x float> %t1, <8 x float> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1232  store <12 x float> %s, <12 x float> *%dst
1233  ret void
1234}
1235
; vst3_v8f32: 3-way interleaving store of <8 x float>.
; Loads three <8 x float> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1 ...) and stores the resulting
; <24 x float> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR.
1236define void @vst3_v8f32(<8 x float> *%src, <24 x float> *%dst) {
1237; CHECK-LABEL: vst3_v8f32:
1238; CHECK:       @ %bb.0: @ %entry
1239; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1240; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1241; CHECK-NEXT:    .pad #16
1242; CHECK-NEXT:    sub sp, #16
1243; CHECK-NEXT:    vldrw.u32 q4, [r0]
1244; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
1245; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
1246; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
1247; CHECK-NEXT:    vmov.f64 d10, d8
1248; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
1249; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
1250; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
1251; CHECK-NEXT:    vmov.f32 s21, s28
1252; CHECK-NEXT:    vmov.f64 d14, d12
1253; CHECK-NEXT:    vmov.f64 d4, d1
1254; CHECK-NEXT:    vmov.f32 s29, s12
1255; CHECK-NEXT:    vmov.f32 s9, s27
1256; CHECK-NEXT:    vmov.f32 s31, s25
1257; CHECK-NEXT:    vmov.f32 s11, s3
1258; CHECK-NEXT:    vmov.f32 s30, s0
1259; CHECK-NEXT:    vmov.f32 s0, s13
1260; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
1261; CHECK-NEXT:    vmov.f32 s3, s14
1262; CHECK-NEXT:    vmov.f32 s2, s26
1263; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
1264; CHECK-NEXT:    vmov.f32 s10, s15
1265; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
1266; CHECK-NEXT:    vmov.f32 s23, s17
1267; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
1268; CHECK-NEXT:    vmov.f32 s12, s25
1269; CHECK-NEXT:    vmov.f32 s13, s5
1270; CHECK-NEXT:    vmov.f32 s22, s4
1271; CHECK-NEXT:    vmov.f32 s4, s6
1272; CHECK-NEXT:    vstrw.32 q5, [r1]
1273; CHECK-NEXT:    vmov.f32 s15, s26
1274; CHECK-NEXT:    vmov.f32 s5, s19
1275; CHECK-NEXT:    vmov.f32 s14, s18
1276; CHECK-NEXT:    vmov.f32 s6, s27
1277; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
1278; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
1279; CHECK-NEXT:    add sp, #16
1280; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1281; CHECK-NEXT:    bx lr
1282entry:
1283  %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
1284  %l1 = load <8 x float>, <8 x float>* %s1, align 4
1285  %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
1286  %l2 = load <8 x float>, <8 x float>* %s2, align 4
1287  %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2
1288  %l3 = load <8 x float>, <8 x float>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
1289  %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1290  %t2 = shufflevector <8 x float> %l3, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1291  %s = shufflevector <16 x float> %t1, <16 x float> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1292  store <24 x float> %s, <24 x float> *%dst
1293  ret void
1294}
1295
; vst3_v16f32: 3-way interleaving store of <16 x float>.
; Loads three <16 x float> vectors from consecutive slots of %src (indices 0, 1, 2),
; interleaves them lane by lane (a0 b0 c0 a1 b1 c1 ...) and stores the resulting
; <48 x float> to %dst. The assertion lines below are the autogenerated expected
; llc output for this IR; note that they include stack spills and reloads.
1296define void @vst3_v16f32(<16 x float> *%src, <48 x float> *%dst) {
1297; CHECK-LABEL: vst3_v16f32:
1298; CHECK:       @ %bb.0: @ %entry
1299; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1300; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1301; CHECK-NEXT:    .pad #160
1302; CHECK-NEXT:    sub sp, #160
1303; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
1304; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
1305; CHECK-NEXT:    vldrw.u32 q1, [r0, #128]
1306; CHECK-NEXT:    vldrw.u32 q6, [r0]
1307; CHECK-NEXT:    vstrw.32 q5, [sp, #112] @ 16-byte Spill
1308; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
1309; CHECK-NEXT:    vmov.f32 s16, s1
1310; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
1311; CHECK-NEXT:    vstrw.32 q5, [sp, #144] @ 16-byte Spill
1312; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
1313; CHECK-NEXT:    vmov.f32 s17, s5
1314; CHECK-NEXT:    vstrw.32 q3, [sp, #128] @ 16-byte Spill
1315; CHECK-NEXT:    vmov.f32 s19, s2
1316; CHECK-NEXT:    vstrw.32 q5, [sp, #32] @ 16-byte Spill
1317; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
1318; CHECK-NEXT:    vmov.f32 s18, s26
1319; CHECK-NEXT:    vldrw.u32 q7, [r0, #144]
1320; CHECK-NEXT:    vldrw.u32 q2, [r0, #176]
1321; CHECK-NEXT:    vstrw.32 q5, [sp, #16] @ 16-byte Spill
1322; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
1323; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
1324; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
1325; CHECK-NEXT:    vmov.f64 d8, d3
1326; CHECK-NEXT:    vstrw.32 q5, [sp, #48] @ 16-byte Spill
1327; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
1328; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
1329; CHECK-NEXT:    vmov.f32 s17, s27
1330; CHECK-NEXT:    vmov.f32 s19, s7
1331; CHECK-NEXT:    vmov.f32 s18, s3
1332; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
1333; CHECK-NEXT:    vmov.f64 d8, d5
1334; CHECK-NEXT:    vmov.f32 s17, s23
1335; CHECK-NEXT:    vmov.f32 s19, s11
1336; CHECK-NEXT:    vmov.f32 s18, s15
1337; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
1338; CHECK-NEXT:    vmov.f64 d8, d12
1339; CHECK-NEXT:    vmov.f32 s17, s0
1340; CHECK-NEXT:    vmov.f32 s19, s25
1341; CHECK-NEXT:    vmov.f32 s18, s4
1342; CHECK-NEXT:    vmov q1, q5
1343; CHECK-NEXT:    vmov.f64 d0, d2
1344; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
1345; CHECK-NEXT:    vmov.f32 s1, s12
1346; CHECK-NEXT:    vmov.f32 s3, s5
1347; CHECK-NEXT:    vmov.f32 s2, s8
1348; CHECK-NEXT:    vmov.f32 s8, s13
1349; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
1350; CHECK-NEXT:    vmov.f32 s11, s14
1351; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
1352; CHECK-NEXT:    vmov.f32 s10, s6
1353; CHECK-NEXT:    vldrw.u32 q1, [sp, #112] @ 16-byte Reload
1354; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
1355; CHECK-NEXT:    vmov.f64 d8, d1
1356; CHECK-NEXT:    vmov q2, q1
1357; CHECK-NEXT:    vmov.f32 s20, s5
1358; CHECK-NEXT:    vmov.f32 s21, s1
1359; CHECK-NEXT:    vmov.f32 s23, s6
1360; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
1361; CHECK-NEXT:    vmov.f64 d6, d15
1362; CHECK-NEXT:    vmov q6, q1
1363; CHECK-NEXT:    vmov.f32 s17, s7
1364; CHECK-NEXT:    vmov.f32 s22, s6
1365; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
1366; CHECK-NEXT:    vmov.f32 s19, s3
1367; CHECK-NEXT:    vmov q0, q6
1368; CHECK-NEXT:    vmov.f32 s13, s7
1369; CHECK-NEXT:    vstrw.32 q5, [r1, #112]
1370; CHECK-NEXT:    vmov.f32 s15, s31
1371; CHECK-NEXT:    vldrw.u32 q7, [sp, #112] @ 16-byte Reload
1372; CHECK-NEXT:    vmov.f32 s18, s11
1373; CHECK-NEXT:    vldrw.u32 q2, [sp, #144] @ 16-byte Reload
1374; CHECK-NEXT:    vmov.f32 s25, s28
1375; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
1376; CHECK-NEXT:    vmov.f32 s27, s1
1377; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
1378; CHECK-NEXT:    vmov.f32 s14, s11
1379; CHECK-NEXT:    vstrw.32 q4, [r1, #128]
1380; CHECK-NEXT:    vmov.f32 s26, s0
1381; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
1382; CHECK-NEXT:    vmov.f64 d0, d2
1383; CHECK-NEXT:    vstrw.32 q6, [r1, #96]
1384; CHECK-NEXT:    vmov.f32 s1, s8
1385; CHECK-NEXT:    vmov q2, q1
1386; CHECK-NEXT:    vmov.f32 s3, s5
1387; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
1388; CHECK-NEXT:    vmov.f32 s2, s28
1389; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
1390; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
1391; CHECK-NEXT:    vmov.f32 s28, s5
1392; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
1393; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
1394; CHECK-NEXT:    vmov.f32 s31, s6
1395; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
1396; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
1397; CHECK-NEXT:    vmov.f32 s30, s10
1398; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
1399; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
1400; CHECK-NEXT:    vstrw.32 q7, [r1, #64]
1401; CHECK-NEXT:    vstrw.32 q0, [r1]
1402; CHECK-NEXT:    add sp, #160
1403; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1404; CHECK-NEXT:    bx lr
1405entry:
1406  %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
1407  %l1 = load <16 x float>, <16 x float>* %s1, align 4
1408  %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
1409  %l2 = load <16 x float>, <16 x float>* %s2, align 4
1410  %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2
1411  %l3 = load <16 x float>, <16 x float>* %s3, align 4
  ; %t1 = l1 followed by l2; %t2 = l3 widened with undef lanes so both shuffle
  ; operands have the same type. The final mask takes lane i of l1, l2, l3 in turn.
1412  %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1413  %t2 = shufflevector <16 x float> %l3, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1414  %s = shufflevector <32 x float> %t1, <32 x float> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1415  store <48 x float> %s, <48 x float> *%dst
1416  ret void
1417}
1418
1419; f16
1420
; vst3_v2f16: three-way interleaving store of <2 x half> vectors.
; Loads three consecutive <2 x half> values from %src (elements 0, 1 and 2),
; interleaves them lane by lane (l1[i], l2[i], l3[i]) and stores the resulting
; <6 x half> to %dst. The FileCheck assertions below are autogenerated (see the
; file header) and pin the exact llc output for the RUN line's target; they
; must be regenerated with the script rather than edited by hand.
1421define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) {
1422; CHECK-LABEL: vst3_v2f16:
1423; CHECK:       @ %bb.0: @ %entry
1424; CHECK-NEXT:    vldmia r0, {s4, s5}
1425; CHECK-NEXT:    vmov r2, s5
1426; CHECK-NEXT:    ldr r0, [r0, #8]
1427; CHECK-NEXT:    vmov r3, s4
1428; CHECK-NEXT:    vmovx.f16 s12, s4
1429; CHECK-NEXT:    vmov.16 q0[0], r3
1430; CHECK-NEXT:    vmov.32 q2[0], r0
1431; CHECK-NEXT:    vmov.16 q0[1], r2
1432; CHECK-NEXT:    vmov r0, s8
1433; CHECK-NEXT:    vmov.16 q0[2], r0
1434; CHECK-NEXT:    vmov r0, s12
1435; CHECK-NEXT:    vmovx.f16 s4, s5
1436; CHECK-NEXT:    vmov.16 q0[3], r0
1437; CHECK-NEXT:    vmov r0, s4
1438; CHECK-NEXT:    vmovx.f16 s4, s8
1439; CHECK-NEXT:    vmov.16 q0[4], r0
1440; CHECK-NEXT:    vmov r0, s4
1441; CHECK-NEXT:    vmov.16 q0[5], r0
1442; CHECK-NEXT:    vmov r0, s0
1443; CHECK-NEXT:    vmov r2, s1
1444; CHECK-NEXT:    vmov r3, s2
1445; CHECK-NEXT:    stm r1!, {r0, r2, r3}
1446; CHECK-NEXT:    bx lr
1447entry:
  ; Load the three source vectors stored back to back at %src.
1448  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
1449  %l1 = load <2 x half>, <2 x half>* %s1, align 4
1450  %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
1451  %l2 = load <2 x half>, <2 x half>* %s2, align 4
1452  %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2
1453  %l3 = load <2 x half>, <2 x half>* %s3, align 4
  ; %t1 = l1 ++ l2; %t2 = l3 widened with undef lanes so both shuffle operands
  ; have the same type; the final mask picks lanes in l1/l2/l3 round-robin order.
1454  %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1455  %t2 = shufflevector <2 x half> %l3, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1456  %s = shufflevector <4 x half> %t1, <4 x half> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1457  store <6 x half> %s, <6 x half> *%dst
1458  ret void
1459}
1460
; vst3_v4f16: three-way interleaving store of <4 x half> vectors.
; Loads three consecutive <4 x half> values from %src, interleaves them lane by
; lane (l1[i], l2[i], l3[i]) and stores the resulting <12 x half> to %dst.
; The FileCheck assertions are autogenerated (see the file header); regenerate
; them with the script instead of editing them by hand.
1461define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
1462; CHECK-LABEL: vst3_v4f16:
1463; CHECK:       @ %bb.0: @ %entry
1464; CHECK-NEXT:    .save {r7, lr}
1465; CHECK-NEXT:    push {r7, lr}
1466; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
1467; CHECK-NEXT:    vmov.32 q0[0], r2
1468; CHECK-NEXT:    vmov.32 q0[1], r3
1469; CHECK-NEXT:    vmov.32 q0[2], r12
1470; CHECK-NEXT:    vmov.32 q0[3], lr
1471; CHECK-NEXT:    vmov r3, s0
1472; CHECK-NEXT:    vmovx.f16 s12, s0
1473; CHECK-NEXT:    vmov r2, s2
1474; CHECK-NEXT:    vmov.16 q2[0], r3
1475; CHECK-NEXT:    vmov.16 q2[1], r2
1476; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
1477; CHECK-NEXT:    vmovx.f16 s0, s3
1478; CHECK-NEXT:    vmov.32 q1[0], r2
1479; CHECK-NEXT:    vmov.32 q1[1], r0
1480; CHECK-NEXT:    vmov r0, s4
1481; CHECK-NEXT:    vmov.16 q2[2], r0
1482; CHECK-NEXT:    vmov r0, s12
1483; CHECK-NEXT:    vmovx.f16 s12, s2
1484; CHECK-NEXT:    vmov.16 q2[3], r0
1485; CHECK-NEXT:    vmov r0, s12
1486; CHECK-NEXT:    vmovx.f16 s12, s4
1487; CHECK-NEXT:    vmov.16 q2[4], r0
1488; CHECK-NEXT:    vmov r0, s12
1489; CHECK-NEXT:    vmov.16 q2[5], r0
1490; CHECK-NEXT:    vmov r0, s1
1491; CHECK-NEXT:    vmov.16 q2[6], r0
1492; CHECK-NEXT:    vmov r0, s3
1493; CHECK-NEXT:    vmov.16 q2[7], r0
1494; CHECK-NEXT:    vmov r2, s5
1495; CHECK-NEXT:    vstrw.32 q2, [r1]
1496; CHECK-NEXT:    vmovx.f16 s8, s1
1497; CHECK-NEXT:    vmov r0, s8
1498; CHECK-NEXT:    vmov.16 q2[0], r2
1499; CHECK-NEXT:    vmov.16 q2[1], r0
1500; CHECK-NEXT:    vmov r0, s0
1501; CHECK-NEXT:    vmovx.f16 s0, s5
1502; CHECK-NEXT:    vmov.16 q2[2], r0
1503; CHECK-NEXT:    vmov r0, s0
1504; CHECK-NEXT:    vmov.16 q2[3], r0
1505; CHECK-NEXT:    vmov r0, s9
1506; CHECK-NEXT:    vmov r2, s8
1507; CHECK-NEXT:    strd r2, r0, [r1, #16]
1508; CHECK-NEXT:    pop {r7, pc}
1509entry:
  ; Load the three source vectors stored back to back at %src.
1510  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
1511  %l1 = load <4 x half>, <4 x half>* %s1, align 4
1512  %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
1513  %l2 = load <4 x half>, <4 x half>* %s2, align 4
1514  %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2
1515  %l3 = load <4 x half>, <4 x half>* %s3, align 4
  ; %t1 = l1 ++ l2; %t2 = l3 widened with undef lanes; the final mask
  ; (0,4,8, 1,5,9, ...) picks lane i of l1, l2 and l3 in turn.
1516  %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1517  %t2 = shufflevector <4 x half> %l3, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1518  %s = shufflevector <8 x half> %t1, <8 x half> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1519  store <12 x half> %s, <12 x half> *%dst
1520  ret void
1521}
1522
; vst3_v8f16: three-way interleaving store of <8 x half> vectors.
; Loads three consecutive <8 x half> values from %src, interleaves them lane by
; lane (l1[i], l2[i], l3[i]) and stores the resulting <24 x half> to %dst.
; The expected code below reserves a 16-byte stack pad and spills/reloads one
; q register. The FileCheck assertions are autogenerated (see the file
; header); regenerate them with the script instead of editing them by hand.
1523define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
1524; CHECK-LABEL: vst3_v8f16:
1525; CHECK:       @ %bb.0: @ %entry
1526; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1527; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1528; CHECK-NEXT:    .pad #16
1529; CHECK-NEXT:    sub sp, #16
1530; CHECK-NEXT:    vldrw.u32 q2, [r0]
1531; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
1532; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
1533; CHECK-NEXT:    vmov r3, s8
1534; CHECK-NEXT:    vmovx.f16 s12, s4
1535; CHECK-NEXT:    vmov r2, s4
1536; CHECK-NEXT:    vmov.16 q0[0], r3
1537; CHECK-NEXT:    vmov.16 q0[1], r2
1538; CHECK-NEXT:    vmov r2, s12
1539; CHECK-NEXT:    vmov r0, s20
1540; CHECK-NEXT:    vmov.16 q0[4], r2
1541; CHECK-NEXT:    vdup.32 q4, r0
1542; CHECK-NEXT:    vmov r2, s9
1543; CHECK-NEXT:    vmov.16 q0[6], r2
1544; CHECK-NEXT:    vmov r2, s5
1545; CHECK-NEXT:    vmovx.f16 s12, s8
1546; CHECK-NEXT:    vmov r0, s17
1547; CHECK-NEXT:    vmov.16 q0[7], r2
1548; CHECK-NEXT:    vmov r2, s12
1549; CHECK-NEXT:    vmov.16 q3[2], r0
1550; CHECK-NEXT:    vmov.f32 s1, s8
1551; CHECK-NEXT:    vmov.16 q3[3], r2
1552; CHECK-NEXT:    vmov r0, s2
1553; CHECK-NEXT:    vmovx.f16 s16, s18
1554; CHECK-NEXT:    vmov.16 q3[4], r0
1555; CHECK-NEXT:    vmov r0, s16
1556; CHECK-NEXT:    vmovx.f16 s16, s22
1557; CHECK-NEXT:    vmov.16 q3[5], r0
1558; CHECK-NEXT:    vmov r0, s16
1559; CHECK-NEXT:    vmovx.f16 s16, s6
1560; CHECK-NEXT:    vmovx.f16 s24, s7
1561; CHECK-NEXT:    vmov r2, s16
1562; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
1563; CHECK-NEXT:    vmov.16 q4[0], r2
1564; CHECK-NEXT:    vmov r2, s11
1565; CHECK-NEXT:    vmov.16 q4[1], r0
1566; CHECK-NEXT:    vmov r0, s7
1567; CHECK-NEXT:    vmov.16 q4[3], r0
1568; CHECK-NEXT:    vmov r0, s24
1569; CHECK-NEXT:    vmovx.f16 s24, s23
1570; CHECK-NEXT:    vmov.16 q4[6], r0
1571; CHECK-NEXT:    vmov r0, s24
1572; CHECK-NEXT:    vdup.32 q7, r2
1573; CHECK-NEXT:    vmov.16 q4[7], r0
1574; CHECK-NEXT:    vmov r2, s29
1575; CHECK-NEXT:    vmov.f32 s18, s23
1576; CHECK-NEXT:    vmovx.f16 s24, s17
1577; CHECK-NEXT:    vmov r0, s24
1578; CHECK-NEXT:    vmov.16 q6[2], r2
1579; CHECK-NEXT:    vmov.16 q6[3], r0
1580; CHECK-NEXT:    vmovx.f16 s28, s30
1581; CHECK-NEXT:    vmovx.f16 s4, s10
1582; CHECK-NEXT:    vmov.f32 s1, s13
1583; CHECK-NEXT:    vmov.f32 s2, s14
1584; CHECK-NEXT:    vstrw.32 q0, [r1]
1585; CHECK-NEXT:    vmov r0, s18
1586; CHECK-NEXT:    vmov.16 q6[4], r0
1587; CHECK-NEXT:    vmov r0, s28
1588; CHECK-NEXT:    vmov.16 q6[5], r0
1589; CHECK-NEXT:    vmovx.f16 s28, s9
1590; CHECK-NEXT:    vmov r0, s21
1591; CHECK-NEXT:    vmov r2, s28
1592; CHECK-NEXT:    vmov.16 q7[0], r0
1593; CHECK-NEXT:    vmov.16 q7[1], r2
1594; CHECK-NEXT:    vmov r0, s22
1595; CHECK-NEXT:    vmov.16 q7[6], r0
1596; CHECK-NEXT:    vmov r0, s4
1597; CHECK-NEXT:    vmov.16 q7[7], r0
1598; CHECK-NEXT:    vmov.f32 s17, s25
1599; CHECK-NEXT:    vmov.f32 s29, s21
1600; CHECK-NEXT:    vmov.f32 s30, s10
1601; CHECK-NEXT:    vmovx.f16 s4, s29
1602; CHECK-NEXT:    vmov r0, s4
1603; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
1604; CHECK-NEXT:    vmov.f32 s18, s26
1605; CHECK-NEXT:    vrev32.16 q2, q1
1606; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
1607; CHECK-NEXT:    vmov r2, s9
1608; CHECK-NEXT:    vmovx.f16 s8, s10
1609; CHECK-NEXT:    vmov.16 q1[2], r2
1610; CHECK-NEXT:    vmov.16 q1[3], r0
1611; CHECK-NEXT:    vmov r0, s30
1612; CHECK-NEXT:    vmov.16 q1[4], r0
1613; CHECK-NEXT:    vmov r0, s8
1614; CHECK-NEXT:    vmov.16 q1[5], r0
1615; CHECK-NEXT:    vmov.f32 s29, s5
1616; CHECK-NEXT:    vmov.f32 s30, s6
1617; CHECK-NEXT:    vstrw.32 q7, [r1, #16]
1618; CHECK-NEXT:    add sp, #16
1619; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1620; CHECK-NEXT:    bx lr
1621entry:
  ; Load the three source vectors stored back to back at %src.
1622  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
1623  %l1 = load <8 x half>, <8 x half>* %s1, align 4
1624  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
1625  %l2 = load <8 x half>, <8 x half>* %s2, align 4
1626  %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
1627  %l3 = load <8 x half>, <8 x half>* %s3, align 4
  ; %t1 = l1 ++ l2; %t2 = l3 widened with undef lanes; the final mask
  ; (0,8,16, 1,9,17, ...) picks lane i of l1, l2 and l3 in turn.
1628  %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1629  %t2 = shufflevector <8 x half> %l3, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1630  %s = shufflevector <16 x half> %t1, <16 x half> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1631  store <24 x half> %s, <24 x half> *%dst
1632  ret void
1633}
1634
; vst3_v16f16: three-way interleaving store of <16 x half> vectors.
; Loads three consecutive <16 x half> values from %src, interleaves them lane
; by lane (l1[i], l2[i], l3[i]) and stores the resulting <48 x half> to %dst.
; The expected code below reserves a 144-byte stack pad and contains several
; q-register spills and reloads. The FileCheck assertions are autogenerated
; (see the file header); regenerate them with the script instead of editing
; them by hand.
1635define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) {
1636; CHECK-LABEL: vst3_v16f16:
1637; CHECK:       @ %bb.0: @ %entry
1638; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1639; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1640; CHECK-NEXT:    .pad #144
1641; CHECK-NEXT:    sub sp, #144
1642; CHECK-NEXT:    vldrw.u32 q0, [r0]
1643; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
1644; CHECK-NEXT:    vmov r3, s0
1645; CHECK-NEXT:    vmov q3, q0
1646; CHECK-NEXT:    vmov r2, s8
1647; CHECK-NEXT:    vmov.16 q1[0], r3
1648; CHECK-NEXT:    vmovx.f16 s0, s8
1649; CHECK-NEXT:    vmov.16 q1[1], r2
1650; CHECK-NEXT:    vmov r2, s0
1651; CHECK-NEXT:    vmovx.f16 s0, s12
1652; CHECK-NEXT:    vmov.16 q1[4], r2
1653; CHECK-NEXT:    vmov r2, s13
1654; CHECK-NEXT:    vmov.16 q1[6], r2
1655; CHECK-NEXT:    vmov r2, s9
1656; CHECK-NEXT:    vmov.16 q1[7], r2
1657; CHECK-NEXT:    vmov r2, s0
1658; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
1659; CHECK-NEXT:    vmov.f32 s5, s12
1660; CHECK-NEXT:    vmov q5, q3
1661; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
1662; CHECK-NEXT:    vmov r3, s0
1663; CHECK-NEXT:    vmov q4, q0
1664; CHECK-NEXT:    vdup.32 q0, r3
1665; CHECK-NEXT:    vstrw.32 q1, [sp, #128] @ 16-byte Spill
1666; CHECK-NEXT:    vmov r3, s1
1667; CHECK-NEXT:    vmovx.f16 s0, s2
1668; CHECK-NEXT:    vmov.16 q3[2], r3
1669; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
1670; CHECK-NEXT:    vmov.16 q3[3], r2
1671; CHECK-NEXT:    vmov r2, s6
1672; CHECK-NEXT:    vmov.16 q3[4], r2
1673; CHECK-NEXT:    vmov r2, s0
1674; CHECK-NEXT:    vmovx.f16 s0, s18
1675; CHECK-NEXT:    vmov.16 q3[5], r2
1676; CHECK-NEXT:    vmov r2, s0
1677; CHECK-NEXT:    vmovx.f16 s0, s10
1678; CHECK-NEXT:    vmov r3, s0
1679; CHECK-NEXT:    vmovx.f16 s0, s11
1680; CHECK-NEXT:    vmov.16 q1[0], r3
1681; CHECK-NEXT:    vmov r3, s23
1682; CHECK-NEXT:    vmov.16 q1[1], r2
1683; CHECK-NEXT:    vmov r2, s11
1684; CHECK-NEXT:    vmov.16 q1[3], r2
1685; CHECK-NEXT:    vmov r2, s0
1686; CHECK-NEXT:    vmovx.f16 s0, s19
1687; CHECK-NEXT:    vmov.16 q1[6], r2
1688; CHECK-NEXT:    vmov r2, s0
1689; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
1690; CHECK-NEXT:    vmov.16 q1[7], r2
1691; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
1692; CHECK-NEXT:    vmov.f32 s6, s19
1693; CHECK-NEXT:    vmovx.f16 s0, s5
1694; CHECK-NEXT:    vmov r2, s0
1695; CHECK-NEXT:    vdup.32 q0, r3
1696; CHECK-NEXT:    vmov r3, s1
1697; CHECK-NEXT:    vmovx.f16 s0, s2
1698; CHECK-NEXT:    vmov.16 q2[2], r3
1699; CHECK-NEXT:    vmov r3, s20
1700; CHECK-NEXT:    vmov.16 q2[3], r2
1701; CHECK-NEXT:    vmov.16 q7[0], r3
1702; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
1703; CHECK-NEXT:    vstrw.32 q1, [sp, #96] @ 16-byte Spill
1704; CHECK-NEXT:    vstrw.32 q4, [sp, #48] @ 16-byte Spill
1705; CHECK-NEXT:    vmov r2, s6
1706; CHECK-NEXT:    vmov.16 q2[4], r2
1707; CHECK-NEXT:    vmov r2, s0
1708; CHECK-NEXT:    vmov.16 q2[5], r2
1709; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
1710; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
1711; CHECK-NEXT:    vmov r0, s12
1712; CHECK-NEXT:    vmov r2, s8
1713; CHECK-NEXT:    vmovx.f16 s0, s8
1714; CHECK-NEXT:    vmov.16 q7[1], r2
1715; CHECK-NEXT:    vmov r2, s0
1716; CHECK-NEXT:    vmov.16 q7[4], r2
1717; CHECK-NEXT:    vmov r2, s21
1718; CHECK-NEXT:    vmov.16 q7[6], r2
1719; CHECK-NEXT:    vmov r2, s9
1720; CHECK-NEXT:    vmovx.f16 s0, s20
1721; CHECK-NEXT:    vmov.16 q7[7], r2
1722; CHECK-NEXT:    vmov r2, s0
1723; CHECK-NEXT:    vdup.32 q0, r0
1724; CHECK-NEXT:    vmov r0, s1
1725; CHECK-NEXT:    vmovx.f16 s0, s2
1726; CHECK-NEXT:    vmov.16 q1[2], r0
1727; CHECK-NEXT:    vmov.f32 s29, s20
1728; CHECK-NEXT:    vmov.16 q1[3], r2
1729; CHECK-NEXT:    vmov r0, s30
1730; CHECK-NEXT:    vmov.16 q1[4], r0
1731; CHECK-NEXT:    vmov r0, s0
1732; CHECK-NEXT:    vmov.16 q1[5], r0
1733; CHECK-NEXT:    vmovx.f16 s0, s14
1734; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
1735; CHECK-NEXT:    vmov q1, q2
1736; CHECK-NEXT:    vmov r0, s0
1737; CHECK-NEXT:    vmovx.f16 s0, s6
1738; CHECK-NEXT:    vmov r2, s0
1739; CHECK-NEXT:    vmovx.f16 s0, s7
1740; CHECK-NEXT:    vmov.16 q2[0], r2
1741; CHECK-NEXT:    vmov r2, s23
1742; CHECK-NEXT:    vmov.16 q2[1], r0
1743; CHECK-NEXT:    vmov r0, s7
1744; CHECK-NEXT:    vmov.16 q2[3], r0
1745; CHECK-NEXT:    vmov r0, s0
1746; CHECK-NEXT:    vmovx.f16 s0, s15
1747; CHECK-NEXT:    vmov.16 q2[6], r0
1748; CHECK-NEXT:    vmov r0, s0
1749; CHECK-NEXT:    vmov.16 q2[7], r0
1750; CHECK-NEXT:    vmov.f32 s10, s15
1751; CHECK-NEXT:    vmovx.f16 s0, s9
1752; CHECK-NEXT:    vmov r0, s0
1753; CHECK-NEXT:    vdup.32 q0, r2
1754; CHECK-NEXT:    vmov r2, s1
1755; CHECK-NEXT:    vmovx.f16 s0, s2
1756; CHECK-NEXT:    vmov.16 q6[2], r2
1757; CHECK-NEXT:    vmov.16 q6[3], r0
1758; CHECK-NEXT:    vmov r0, s10
1759; CHECK-NEXT:    vmov.16 q6[4], r0
1760; CHECK-NEXT:    vmov r0, s0
1761; CHECK-NEXT:    vmov.16 q6[5], r0
1762; CHECK-NEXT:    vmov r0, s13
1763; CHECK-NEXT:    vmovx.f16 s0, s21
1764; CHECK-NEXT:    vmov.16 q4[0], r0
1765; CHECK-NEXT:    vmov r2, s0
1766; CHECK-NEXT:    vmovx.f16 s0, s22
1767; CHECK-NEXT:    vmov.16 q4[1], r2
1768; CHECK-NEXT:    vmov r0, s14
1769; CHECK-NEXT:    vmov.16 q4[6], r0
1770; CHECK-NEXT:    vmov r0, s0
1771; CHECK-NEXT:    vmov.16 q4[7], r0
1772; CHECK-NEXT:    vmov.f32 s9, s25
1773; CHECK-NEXT:    vmov.f32 s17, s13
1774; CHECK-NEXT:    vldrw.u32 q3, [sp, #32] @ 16-byte Reload
1775; CHECK-NEXT:    vmov.f32 s18, s22
1776; CHECK-NEXT:    vmovx.f16 s0, s17
1777; CHECK-NEXT:    vmov r0, s0
1778; CHECK-NEXT:    vrev32.16 q0, q1
1779; CHECK-NEXT:    vmov r2, s1
1780; CHECK-NEXT:    vmovx.f16 s0, s2
1781; CHECK-NEXT:    vmov.16 q1[2], r2
1782; CHECK-NEXT:    vmov.f32 s10, s26
1783; CHECK-NEXT:    vmov.16 q1[3], r0
1784; CHECK-NEXT:    vldrw.u32 q6, [sp, #112] @ 16-byte Reload
1785; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
1786; CHECK-NEXT:    vmov r0, s18
1787; CHECK-NEXT:    vmov.16 q1[4], r0
1788; CHECK-NEXT:    vmov r0, s0
1789; CHECK-NEXT:    vmov.16 q1[5], r0
1790; CHECK-NEXT:    vmovx.f16 s0, s13
1791; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
1792; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
1793; CHECK-NEXT:    vmov r2, s0
1794; CHECK-NEXT:    vmovx.f16 s0, s14
1795; CHECK-NEXT:    vmov r0, s5
1796; CHECK-NEXT:    vmov.16 q5[0], r0
1797; CHECK-NEXT:    vmov r0, s6
1798; CHECK-NEXT:    vmov.16 q5[1], r2
1799; CHECK-NEXT:    vmov.16 q5[6], r0
1800; CHECK-NEXT:    vmov r0, s0
1801; CHECK-NEXT:    vmov.16 q5[7], r0
1802; CHECK-NEXT:    vmov.f32 s21, s5
1803; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
1804; CHECK-NEXT:    vmov.f32 s22, s14
1805; CHECK-NEXT:    vmovx.f16 s0, s21
1806; CHECK-NEXT:    vmov r0, s0
1807; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
1808; CHECK-NEXT:    vrev32.16 q3, q0
1809; CHECK-NEXT:    vmov r2, s13
1810; CHECK-NEXT:    vmovx.f16 s12, s14
1811; CHECK-NEXT:    vmov.16 q0[2], r2
1812; CHECK-NEXT:    vmov.16 q0[3], r0
1813; CHECK-NEXT:    vmov r0, s22
1814; CHECK-NEXT:    vmov.16 q0[4], r0
1815; CHECK-NEXT:    vmov r0, s12
1816; CHECK-NEXT:    vldrw.u32 q3, [sp, #128] @ 16-byte Reload
1817; CHECK-NEXT:    vmov.16 q0[5], r0
1818; CHECK-NEXT:    vmov.f32 s13, s25
1819; CHECK-NEXT:    vmov.f32 s14, s26
1820; CHECK-NEXT:    vldrw.u32 q6, [sp, #96] @ 16-byte Reload
1821; CHECK-NEXT:    vmov.f32 s25, s5
1822; CHECK-NEXT:    vstrw.32 q3, [r1]
1823; CHECK-NEXT:    vmov.f32 s21, s1
1824; CHECK-NEXT:    vmov.f32 s26, s6
1825; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
1826; CHECK-NEXT:    vmov.f32 s22, s2
1827; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
1828; CHECK-NEXT:    vmov.f32 s29, s5
1829; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
1830; CHECK-NEXT:    vmov.f32 s17, s1
1831; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
1832; CHECK-NEXT:    vmov.f32 s30, s6
1833; CHECK-NEXT:    vmov.f32 s18, s2
1834; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
1835; CHECK-NEXT:    vstrw.32 q4, [r1, #64]
1836; CHECK-NEXT:    add sp, #144
1837; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1838; CHECK-NEXT:    bx lr
1839entry:
  ; Load the three source vectors stored back to back at %src.
1840  %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
1841  %l1 = load <16 x half>, <16 x half>* %s1, align 4
1842  %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
1843  %l2 = load <16 x half>, <16 x half>* %s2, align 4
1844  %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2
1845  %l3 = load <16 x half>, <16 x half>* %s3, align 4
  ; %t1 = l1 ++ l2; %t2 = l3 widened with undef lanes; the final mask
  ; (0,16,32, 1,17,33, ...) picks lane i of l1, l2 and l3 in turn.
1846  %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1847  %t2 = shufflevector <16 x half> %l3, <16 x half> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1848  %s = shufflevector <32 x half> %t1, <32 x half> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1849  store <48 x half> %s, <48 x half> *%dst
1850  ret void
1851}
1852
1853; f64
1854
; vst3_v2f64: three-way interleaving store of <2 x double> vectors.
; Loads three consecutive <2 x double> values from %src, interleaves them lane
; by lane (l1[i], l2[i], l3[i]) and stores the resulting <6 x double> to %dst.
; The expected code is three q-register loads, d-register moves and three
; q-register stores. The FileCheck assertions are autogenerated (see the file
; header); regenerate them with the script instead of editing them by hand.
1855define void @vst3_v2f64(<2 x double> *%src, <6 x double> *%dst) {
1856; CHECK-LABEL: vst3_v2f64:
1857; CHECK:       @ %bb.0: @ %entry
1858; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
1859; CHECK-NEXT:    vldrw.u32 q0, [r0]
1860; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
1861; CHECK-NEXT:    vmov.f64 d6, d2
1862; CHECK-NEXT:    vmov.f64 d7, d1
1863; CHECK-NEXT:    vmov.f64 d1, d4
1864; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
1865; CHECK-NEXT:    vmov.f64 d2, d5
1866; CHECK-NEXT:    vstrw.32 q0, [r1]
1867; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
1868; CHECK-NEXT:    bx lr
1869entry:
  ; Load the three source vectors stored back to back at %src.
1870  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
1871  %l1 = load <2 x double>, <2 x double>* %s1, align 4
1872  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
1873  %l2 = load <2 x double>, <2 x double>* %s2, align 4
1874  %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2
1875  %l3 = load <2 x double>, <2 x double>* %s3, align 4
  ; %t1 = l1 ++ l2; %t2 = l3 widened with undef lanes; the final mask
  ; (0,2,4, 1,3,5) picks lane i of l1, l2 and l3 in turn.
1876  %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1877  %t2 = shufflevector <2 x double> %l3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1878  %s = shufflevector <4 x double> %t1, <4 x double> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1879  store <6 x double> %s, <6 x double> *%dst
1880  ret void
1881}
1882
; vst3_v4f64: three-way interleaving store of <4 x double> vectors.
; Loads three consecutive <4 x double> values from %src, interleaves them lane
; by lane (l1[i], l2[i], l3[i]) and stores the resulting <12 x double> to %dst.
; The expected code below reserves a 16-byte stack pad and spills/reloads one
; q register. The FileCheck assertions are autogenerated (see the file
; header); regenerate them with the script instead of editing them by hand.
1883define void @vst3_v4f64(<4 x double> *%src, <12 x double> *%dst) {
1884; CHECK-LABEL: vst3_v4f64:
1885; CHECK:       @ %bb.0: @ %entry
1886; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1887; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1888; CHECK-NEXT:    .pad #16
1889; CHECK-NEXT:    sub sp, #16
1890; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
1891; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
1892; CHECK-NEXT:    vldrw.u32 q1, [r0]
1893; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
1894; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
1895; CHECK-NEXT:    vmov.f64 d6, d15
1896; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
1897; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
1898; CHECK-NEXT:    vmov.f64 d10, d2
1899; CHECK-NEXT:    vmov.f64 d7, d1
1900; CHECK-NEXT:    vmov.f64 d11, d12
1901; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
1902; CHECK-NEXT:    vmov.f64 d12, d4
1903; CHECK-NEXT:    vstrw.32 q5, [r1]
1904; CHECK-NEXT:    vmov.f64 d1, d5
1905; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
1906; CHECK-NEXT:    vmov.f64 d2, d8
1907; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
1908; CHECK-NEXT:    vmov.f64 d13, d14
1909; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
1910; CHECK-NEXT:    vmov.f64 d8, d5
1911; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
1912; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
1913; CHECK-NEXT:    add sp, #16
1914; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1915; CHECK-NEXT:    bx lr
1916entry:
  ; Load the three source vectors stored back to back at %src.
1917  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
1918  %l1 = load <4 x double>, <4 x double>* %s1, align 4
1919  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
1920  %l2 = load <4 x double>, <4 x double>* %s2, align 4
1921  %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2
1922  %l3 = load <4 x double>, <4 x double>* %s3, align 4
  ; %t1 = l1 ++ l2; %t2 = l3 widened with undef lanes; the final mask
  ; (0,4,8, 1,5,9, ...) picks lane i of l1, l2 and l3 in turn.
1923  %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1924  %t2 = shufflevector <4 x double> %l3, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1925  %s = shufflevector <8 x double> %t1, <8 x double> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1926  store <12 x double> %s, <12 x double> *%dst
1927  ret void
1928}
1929