; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s

; i32

; vld4-style deinterleave of <8 x i32> into four <2 x i32> lanes, summed and stored.
; Too small for the MVE vld4 instructions, so codegen uses scalar vmov/add.
define void @vld4_v2i32(<8 x i32> *%src, <2 x i32> *%dst) {
; CHECK-LABEL: vld4_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.f32 s8, s3
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.f32 s12, s1
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    add.w r12, r2, r0
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    add r0, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strd r0, r12, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i32>, <8 x i32>* %src, align 4
  %s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i32> %s1, %s2
  %a2 = add <2 x i32> %s3, %s4
  %a3 = add <2 x i32> %a1, %a2
  store <2 x i32> %a3, <2 x i32> *%dst
  ret void
}

; vld4 deinterleave of <16 x i32>: expected to select the MVE vld40-vld43 sequence.
define void @vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) {
; CHECK-LABEL: vld4_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i32 q4, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i32>, <16 x i32>* %src, align 4
  %s1 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i32> %s1, %s2
  %a2 = add <4 x i32> %s3, %s4
  %a3 = add <4 x i32> %a1, %a2
  store <4 x i32> %a3, <4 x i32> *%dst
  ret void
}

; vld4 deinterleave of <32 x i32>: two back-to-back vld4x sequences (post-increment on the first).
define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) {
; CHECK-LABEL: vld4_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i32 q6, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vadd.i32 q0, q0, q6
; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
; CHECK-NEXT:    vadd.i32 q5, q3, q4
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vadd.i32 q1, q1, q5
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i32>, <32 x i32>* %src, align 4
  %s1 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i32> %s1, %s2
  %a2 = add <8 x i32> %s3, %s4
  %a3 = add <8 x i32> %a1, %a2
  store <8 x i32> %a3, <8 x i32> *%dst
  ret void
}

; vld4 deinterleave of <64 x i32>: four vld4x sequences with stack spills/reloads
; of partial sums due to register pressure.
define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) {
; CHECK-LABEL: vld4_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5}
; CHECK-NEXT:    push {r4, r5}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    .pad #136
; CHECK-NEXT:    sub sp, #136
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    add.w r3, r0, #192
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    adds r0, #128
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i32 q4, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q4, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vldrw.u32 q6, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q5, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vadd.i32 q6, q5, q6
; CHECK-NEXT:    vldrw.u32 q5, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q6, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vadd.i32 q1, q3, q5
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov q5, q1
; CHECK-NEXT:    vldrw.u32 q1, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q0, q0, q5
; CHECK-NEXT:    vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
; CHECK-NEXT:    vadd.i32 q2, q3, q4
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q1, q5, q6
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #136
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    pop {r4, r5}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i32>, <64 x i32>* %src, align 4
  %s1 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i32> %s1, %s2
  %a2 = add <16 x i32> %s3, %s4
  %a3 = add <16 x i32> %a1, %a2
  store <16 x i32> %a3, <16 x i32> *%dst
  ret void
}

; i16

; vld4-style deinterleave of <8 x i16> into four <2 x i16> lanes; too small for
; the MVE vld4 instructions, so codegen uses scalar lane extracts and adds.
define void @vld4_v2i16(<8 x i16> *%src, <2 x i16> *%dst) {
; CHECK-LABEL: vld4_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1, #2]
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i16>, <8 x i16>* %src, align 4
  %s1 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i16> %s1, %s2
  %a2 = add <2 x i16> %s3, %s4
  %a3 = add <2 x i16> %a1, %a2
  store <2 x i16> %a3, <2 x i16> *%dst
  ret void
}

; vld4-style deinterleave of <16 x i16> into four <4 x i16> lanes, built with
; per-lane vmov inserts and summed as i32 vectors, truncating store via vstrh.32.
define void @vld4_v4i16(<16 x i16> *%src, <4 x i16> *%dst) {
; CHECK-LABEL: vld4_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.u16 r2, q1[3]
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.u16 r2, q1[7]
; CHECK-NEXT:    vmov.32 q2[1], r2
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.32 q2[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.32 q3[1], r0
; CHECK-NEXT:    vmov.u16 r0, q0[2]
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    vmov.32 q3[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[0]
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[4]
; CHECK-NEXT:    vmov.32 q3[1], r0
; CHECK-NEXT:    vmov.u16 r0, q0[0]
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.u16 r0, q1[1]
; CHECK-NEXT:    vmov.32 q4[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[5]
; CHECK-NEXT:    vmov.32 q4[1], r0
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov.32 q4[2], r0
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    vmov.32 q4[3], r0
; CHECK-NEXT:    vmov.u16 r0, q0[4]
; CHECK-NEXT:    vmov.32 q3[3], r0
; CHECK-NEXT:    vadd.i32 q0, q3, q4
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i16>, <16 x i16>* %src, align 4
  %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i16> %s1, %s2
  %a2 = add <4 x i16> %s3, %s4
  %a3 = add <4 x i16> %a1, %a2
  store <4 x i16> %a3, <4 x i16> *%dst
  ret void
}

; vld4 deinterleave of <32 x i16>: expected to select the MVE vld40-vld43.16 sequence.
define void @vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld4_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i16 q4, q2, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i16>, <32 x i16>* %src, align 4
  %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i16> %s1, %s2
  %a2 = add <8 x i16> %s3, %s4
  %a3 = add <8 x i16> %a1, %a2
  store <8 x i16> %a3, <8 x i16> *%dst
  ret void
}

; vld4 deinterleave of <64 x i16>: two vld4x.16 sequences (post-increment on the first).
define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) {
; CHECK-LABEL: vld4_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i16 q6, q2, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vld40.16 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vadd.i16 q0, q0, q6
; CHECK-NEXT:    vld41.16 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld42.16 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld43.16 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
; CHECK-NEXT:    vadd.i16 q5, q3, q4
; CHECK-NEXT:    vadd.i16 q1, q1, q2
; CHECK-NEXT:    vadd.i16 q1, q1, q5
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i16>, <64 x i16>* %src, align 4
  %s1 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i16> %s1, %s2
  %a2 = add <16 x i16> %s3, %s4
  %a3 = add <16 x i16> %a1, %a2
  store <16 x i16> %a3, <16 x i16> *%dst
  ret void
}

; i8

; vld4-style deinterleave of <8 x i8> into four <2 x i8> lanes; the load is
; widened (vldrb.u16) and the sums are done with scalar lane extracts.
define void @vld4_v2i8(<8 x i8> *%src, <2 x i8> *%dst) {
; CHECK-LABEL: vld4_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1, #1]
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i8>, <8 x i8>* %src, align 4
  %s1 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i8> %s1, %s2
  %a2 = add <2 x i8> %s3, %s4
  %a3 = add <2 x i8> %a1, %a2
  store <2 x i8> %a3, <2 x i8> *%dst
  ret void
}

; vld4-style deinterleave of <16 x i8> into four <4 x i8> lanes; uses vrev32.8 /
; vrev16.8 tricks to pair lanes, with a truncating vstrb.32 store.
define void @vld4_v4i8(<16 x i8> *%src, <4 x i8> *%dst) {
; CHECK-LABEL: vld4_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vrev32.8 q2, q0
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.32 q1[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.32 q1[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.32 q1[3], r0
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vrev16.8 q2, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrb.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i8>, <16 x i8>* %src, align 4
  %s1 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i8> %s1, %s2
  %a2 = add <4 x i8> %s3, %s4
  %a3 = add <4 x i8> %a1, %a2
  store <4 x i8> %a3, <4 x i8> *%dst
  ret void
}

; vld4-style deinterleave of <32 x i8> into four <8 x i8> lanes, built with
; per-lane vmov.16 inserts, summed as i16 vectors, truncating store via vstrb.16.
define void @vld4_v8i8(<32 x i8> *%src, <8 x i8> *%dst) {
; CHECK-LABEL: vld4_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.u8 r2, q1[3]
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[7]
; CHECK-NEXT:    vmov.16 q2[1], r2
; CHECK-NEXT:    vmov.u8 r2, q1[11]
; CHECK-NEXT:    vmov.16 q2[2], r2
; CHECK-NEXT:    vmov.u8 r2, q1[15]
; CHECK-NEXT:    vmov.16 q2[3], r2
; CHECK-NEXT:    vmov.16 q2[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.16 q2[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.16 q2[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.16 q2[7], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.16 q4[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.16 q4[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.16 q4[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.16 q4[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.16 q4[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.16 q4[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.16 q4[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.16 q4[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vadd.i16 q0, q3, q4
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vstrb.16 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i8>, <32 x i8>* %src, align 4
  %s1 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i8> %s1, %s2
  %a2 = add <8 x i8> %s3, %s4
  %a3 = add <8 x i8> %a1, %a2
  store <8 x i8> %a3, <8 x i8> *%dst
  ret void
}

; vld4 deinterleave of <64 x i8>: expected to select the MVE vld40-vld43.8 sequence.
define void @vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) {
; CHECK-LABEL: vld4_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i8 q4, q2, q3
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vadd.i8 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i8>, <64 x i8>* %src, align 4
  %s1 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i8> %s1, %s2
  %a2 = add <16 x i8> %s3, %s4
  %a3 = add <16 x i8> %a1, %a2
  store <16 x i8> %a3, <16 x i8> *%dst
  ret void
}

; i64

; vld4-style deinterleave of <8 x i64> into four <2 x i64> lanes; i64 adds are
; expanded to scalar adds/adc pairs since MVE has no 64-bit vector add.
define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
; CHECK-LABEL: vld4_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vmov.f64 d8, d7
; CHECK-NEXT:    vmov.f32 s17, s15
; CHECK-NEXT:    vmov.f32 s18, s22
; CHECK-NEXT:    vmov.f32 s14, s20
; CHECK-NEXT:    vmov.f32 s19, s23
; CHECK-NEXT:    vmov.f32 s15, s21
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.f64 d2, d1
; CHECK-NEXT:    vmov r12, s19
; CHECK-NEXT:    vmov r2, s15
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmov.f32 s6, s10
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vmov.f32 s3, s9
; CHECK-NEXT:    vmov.f32 s7, s11
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    vmov r6, s0
; CHECK-NEXT:    adds.w lr, r0, r3
; CHECK-NEXT:    vmov r3, s7
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    adds r2, r2, r4
; CHECK-NEXT:    vmov r4, s13
; CHECK-NEXT:    adcs r0, r3
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s17
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r2, r4, r3
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    vmov r4, s1
; CHECK-NEXT:    adds r5, r5, r6
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    adds r0, r0, r5
; CHECK-NEXT:    adcs r2, r3
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], lr
; CHECK-NEXT:    vmov.32 q0[3], r12
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %l1 = load <8 x i64>, <8 x i64>* %src, align 4
  %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i64> %s1, %s2
  %a2 = add <2 x i64> %s3, %s4
  %a3 = add <2 x i64> %a1, %a2
  store <2 x i64> %a3, <2 x i64> *%dst
  ret void
}

; vld4-style deinterleave of <16 x i64> into four <4 x i64> lanes; expanded to
; scalar adds/adc with heavy shuffling and stack spills of intermediate vectors.
define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
; CHECK-LABEL: vld4_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #72
; CHECK-NEXT:    sub sp, #72
; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
; CHECK-NEXT:    vmov.f64 d8, d3
; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s17, s7
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f32 s18, s2
; CHECK-NEXT:    vmov.f32 s19, s3
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vmov.f64 d12, d11
; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s25, s23
; CHECK-NEXT:    vmov.f32 s26, s2
; CHECK-NEXT:    vmov.f64 d6, d3
; CHECK-NEXT:    vmov.f32 s27, s3
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vmov.f32 s13, s7
; CHECK-NEXT:    vmov.f32 s14, s2
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov.f64 d4, d15
; CHECK-NEXT:    vmov.f32 s15, s3
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vmov.f32 s9, s31
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s30, s0
; CHECK-NEXT:    vmov.f32 s11, s3
; CHECK-NEXT:    vmov.f32 s31, s1
; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov r0, s30
; CHECK-NEXT:    vmov.f32 s6, s0
; CHECK-NEXT:    vmov.f32 s7, s1
; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov r12, s11
; CHECK-NEXT:    vmov r2, s31
; CHECK-NEXT:    vmov.f32 s22, s0
; CHECK-NEXT:    vmov.f32 s23, s1
; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    vmov r5, s18
; CHECK-NEXT:    vmov r7, s16
; CHECK-NEXT:    adds.w lr, r0, r3
; CHECK-NEXT:    vmov r3, s14
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s15
; CHECK-NEXT:    vmov r6, s2
; CHECK-NEXT:    adds r3, r3, r4
; CHECK-NEXT:    vmov r4, s23
; CHECK-NEXT:    adcs r0, r2
; CHECK-NEXT:    adds.w lr, lr, r3
; CHECK-NEXT:    adc.w r12, r12, r0
; CHECK-NEXT:    vmov r0, s26
; CHECK-NEXT:    vmov r2, s22
; CHECK-NEXT:    vmov r3, s27
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r2, r4, r3
; CHECK-NEXT:    vmov r3, s19
; CHECK-NEXT:    vmov r4, s3
; CHECK-NEXT:    adds r5, r5, r6
; CHECK-NEXT:    vmov r6, s20
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    adds r0, r0, r5
; CHECK-NEXT:    vmov r5, s24
; CHECK-NEXT:    adc.w r8, r3, r2
; CHECK-NEXT:    vmov r2, s25
; CHECK-NEXT:    vmov r4, s21
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    adds r5, r5, r6
; CHECK-NEXT:    vmov r6, s1
; CHECK-NEXT:    adcs r2, r4
; CHECK-NEXT:    vmov r4, s17
; CHECK-NEXT:    adds r3, r3, r7
; CHECK-NEXT:    vmov r7, s28
; CHECK-NEXT:    adcs r4, r6
; CHECK-NEXT:    adds r3, r3, r5
; CHECK-NEXT:    vmov r6, s8
; CHECK-NEXT:    adcs r2, r4
; CHECK-NEXT:    vmov r4, s9
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov r5, s29
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov r3, s13
; CHECK-NEXT:    vmov.32 q0[3], r8
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    adds r6, r6, r7
; CHECK-NEXT:    adcs r4, r5
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    adds r0, r0, r5
; CHECK-NEXT:    adcs r2, r3
; CHECK-NEXT:    adds r0, r0, r6
; CHECK-NEXT:    adcs r2, r4
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], lr
; CHECK-NEXT:    vmov.32 q0[3], r12
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #72
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
entry:
  %l1 = load <16 x i64>, <16 x i64>* %src, align 4
  %s1 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i64> %s1, %s2
  %a2 = add <4 x i64> %s3, %s4
  %a3 = add <4 x i64> %a1, %a2
  store <4 x i64> %a3, <4 x i64> *%dst
  ret void
}

745; f32
746
; VF=2 f32 de-interleave (lanes 0/4, 1/5, 2/6, 3/7) + reduction-style fadd.
; Too narrow for the MVE VLD4 instruction group: the checks show plain vldrw
; loads with vmov.f32 lane shuffles feeding vadd.f32, and a 2-lane vstmia store.
define void @vld4_v2f32(<8 x float> *%src, <2 x float> *%dst) {
; CHECK-LABEL: vld4_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.f32 s8, s7
; CHECK-NEXT:    vmov.f64 d6, d3
; CHECK-NEXT:    vmov.f32 s9, s3
; CHECK-NEXT:    vmov.f32 s13, s2
; CHECK-NEXT:    vadd.f32 q2, q3, q2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s13, s1
; CHECK-NEXT:    vmov.f32 s5, s0
; CHECK-NEXT:    vadd.f32 q0, q1, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vstmia r1, {s0, s1}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x float>, <8 x float>* %src, align 4
  %s1 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 3, i32 7>
  %a1 = fadd <2 x float> %s1, %s2
  %a2 = fadd <2 x float> %s3, %s4
  %a3 = fadd <2 x float> %a1, %a2
  store <2 x float> %a3, <2 x float> *%dst
  ret void
}
776
; VF=4 f32: the 4-way stride-4 shuffle pattern maps directly onto a single
; MVE VLD40-VLD43 .32 instruction group, followed by three vadd.f32 to sum
; the four de-interleaved registers.
define void @vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) {
; CHECK-LABEL: vld4_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.f32 q4, q2, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vadd.f32 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x float>, <16 x float>* %src, align 4
  %s1 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = fadd <4 x float> %s1, %s2
  %a2 = fadd <4 x float> %s3, %s4
  %a3 = fadd <4 x float> %a1, %a2
  store <4 x float> %a3, <4 x float> *%dst
  ret void
}
805
; VF=8 f32: split into two VLD40-VLD43 .32 groups; the first group's final
; vld43 uses post-increment ([r0]!) to advance the pointer to the second
; 64-byte chunk. Each group's four registers are summed and stored separately.
define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) {
; CHECK-LABEL: vld4_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.f32 q6, q2, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q0, q6
; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
; CHECK-NEXT:    vadd.f32 q5, q3, q4
; CHECK-NEXT:    vadd.f32 q1, q1, q2
; CHECK-NEXT:    vadd.f32 q1, q1, q5
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x float>, <32 x float>* %src, align 4
  %s1 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = fadd <8 x float> %s1, %s2
  %a2 = fadd <8 x float> %s3, %s4
  %a3 = fadd <8 x float> %a1, %a2
  store <8 x float> %a3, <8 x float> *%dst
  ret void
}
843
; VF=16 f32: four VLD40-VLD43 .32 groups (at r0, r0+64 via post-inc, r0+128,
; r0+192). Register pressure forces stack spills/reloads of intermediate sums
; and whole vld4 result groups (vstmia/vldmia 64-byte spills).
define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) {
; CHECK-LABEL: vld4_v16f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5}
; CHECK-NEXT:    push {r4, r5}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    .pad #136
; CHECK-NEXT:    sub sp, #136
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    add.w r3, r0, #192
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    adds r0, #128
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.f32 q4, q2, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q4, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vldrw.u32 q6, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q5, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vadd.f32 q6, q5, q6
; CHECK-NEXT:    vldrw.u32 q5, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q6, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vadd.f32 q1, q3, q5
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov q5, q1
; CHECK-NEXT:    vldrw.u32 q1, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 q0, q0, q5
; CHECK-NEXT:    vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
; CHECK-NEXT:    vadd.f32 q1, q2, q1
; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
; CHECK-NEXT:    vadd.f32 q2, q3, q4
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vadd.f32 q1, q5, q6
; CHECK-NEXT:    vadd.f32 q1, q2, q1
; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #136
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    pop {r4, r5}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x float>, <64 x float>* %src, align 4
  %s1 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = fadd <16 x float> %s1, %s2
  %a2 = fadd <16 x float> %s3, %s4
  %a3 = fadd <16 x float> %a1, %a2
  store <16 x float> %a3, <16 x float> *%dst
  ret void
}
923
924; f16
925
; VF=2 f16: too narrow for VLD4. The checks show a single vldrw load, half
; lanes extracted with vmovx.f16/vmov and rebuilt via vmov.16 lane inserts,
; then vadd.f16 sums and a 32-bit str of the 2 x f16 result.
define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) {
; CHECK-LABEL: vld4_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmovx.f16 s4, s3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmovx.f16 s4, s1
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov.16 q1[0], r2
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vmov.16 q1[1], r0
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov.16 q2[1], r0
; CHECK-NEXT:    vadd.f16 q1, q2, q1
; CHECK-NEXT:    vmovx.f16 s8, s0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmovx.f16 s8, s2
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov.16 q2[0], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q2[1], r2
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    vmov.16 q0[1], r2
; CHECK-NEXT:    vadd.f16 q0, q0, q2
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    str r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x half>, <8 x half>* %src, align 4
  %s1 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 3, i32 7>
  %a1 = fadd <2 x half> %s1, %s2
  %a2 = fadd <2 x half> %s3, %s4
  %a3 = fadd <2 x half> %a1, %a2
  store <2 x half> %a3, <2 x half> *%dst
  ret void
}
968
; VF=4 f16: still below the VLD4 threshold. Lanes are gathered from two
; vldrw loads with vmovx.f16/vmov.16 lane inserts, summed with vadd.f16, and
; the 4 x f16 (64-bit) result is stored with strd.
define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
; CHECK-LABEL: vld4_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8}
; CHECK-NEXT:    vpush {d8}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    vmovx.f16 s12, s5
; CHECK-NEXT:    vmov r3, s7
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov.16 q2[1], r3
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov.16 q2[2], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmovx.f16 s12, s7
; CHECK-NEXT:    vmovx.f16 s16, s1
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.16 q3[1], r2
; CHECK-NEXT:    vmovx.f16 s16, s3
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmovx.f16 s16, s0
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vadd.f16 q2, q2, q3
; CHECK-NEXT:    vmovx.f16 s12, s4
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmovx.f16 s12, s6
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.16 q3[1], r2
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmovx.f16 s16, s2
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.16 q1[0], r0
; CHECK-NEXT:    vmov.16 q1[1], r2
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q1[2], r0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov.16 q1[3], r0
; CHECK-NEXT:    vadd.f16 q0, q1, q3
; CHECK-NEXT:    vadd.f16 q0, q0, q2
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    strd r0, r2, [r1]
; CHECK-NEXT:    vpop {d8}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x half>, <16 x half>* %src, align 4
  %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = fadd <4 x half> %s1, %s2
  %a2 = fadd <4 x half> %s3, %s4
  %a3 = fadd <4 x half> %a1, %a2
  store <4 x half> %a3, <4 x half> *%dst
  ret void
}
1036
; VF=8 f16: full-width case, lowering to a single VLD40-VLD43 .16 group plus
; three vadd.f16 — the f16 analogue of vld4_v4f32.
define void @vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vld4_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.f16 q4, q2, q3
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vadd.f16 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x half>, <32 x half>* %src, align 4
  %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = fadd <8 x half> %s1, %s2
  %a2 = fadd <8 x half> %s3, %s4
  %a3 = fadd <8 x half> %a1, %a2
  store <8 x half> %a3, <8 x half> *%dst
  ret void
}
1065
; VF=16 f16: two VLD40-VLD43 .16 groups (second reached via post-increment on
; the first group's vld43). The first group's four q-registers are spilled
; with a 64-byte vstmia and reloaded after the second group is reduced.
define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vld4_v16f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5}
; CHECK-NEXT:    push {r4, r5}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #88
; CHECK-NEXT:    sub sp, #88
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
; CHECK-NEXT:    vld40.16 {q4, q5, q6, q7}, [r0]
; CHECK-NEXT:    vld41.16 {q4, q5, q6, q7}, [r0]
; CHECK-NEXT:    vld42.16 {q4, q5, q6, q7}, [r0]
; CHECK-NEXT:    vld43.16 {q4, q5, q6, q7}, [r0]
; CHECK-NEXT:    @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7
; CHECK-NEXT:    vadd.f16 q0, q6, q7
; CHECK-NEXT:    vadd.f16 q4, q4, q5
; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    vadd.f16 q4, q4, q0
; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
; CHECK-NEXT:    vadd.f16 q4, q2, q3
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vadd.f16 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #88
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop {r4, r5}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x half>, <64 x half>* %src, align 4
  %s1 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = fadd <16 x half> %s1, %s2
  %a2 = fadd <16 x half> %s3, %s4
  %a3 = fadd <16 x half> %a1, %a2
  store <16 x half> %a3, <16 x half> *%dst
  ret void
}
1113
1114; f64
1115
; f64 case: MVE has no 64-bit VLD4, so the checks show plain vldrw loads with
; the de-interleave folded into paired vadd.f64 on d-register halves.
define void @vld4_v2f64(<8 x double> *%src, <2 x double> *%dst) {
; CHECK-LABEL: vld4_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vadd.f64 d0, d0, d1
; CHECK-NEXT:    vadd.f64 d1, d2, d3
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vadd.f64 d2, d2, d3
; CHECK-NEXT:    vadd.f64 d3, d4, d5
; CHECK-NEXT:    vadd.f64 d1, d1, d0
; CHECK-NEXT:    vadd.f64 d0, d3, d2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x double>, <8 x double>* %src, align 4
  %s1 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 3, i32 7>
  %a1 = fadd <2 x double> %s1, %s2
  %a2 = fadd <2 x double> %s3, %s4
  %a3 = fadd <2 x double> %a1, %a2
  store <2 x double> %a3, <2 x double> *%dst
  ret void
}
1143
; Wider f64 case: still no 64-bit VLD4, so eight vldrw loads (offsets 0..112)
; feed pairwise vadd.f64 reductions; two q-register results are stored.
define void @vld4_v4f64(<16 x double> *%src, <4 x double> *%dst) {
; CHECK-LABEL: vld4_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vadd.f64 d0, d0, d1
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vadd.f64 d1, d2, d3
; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
; CHECK-NEXT:    vadd.f64 d2, d2, d3
; CHECK-NEXT:    vadd.f64 d3, d4, d5
; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
; CHECK-NEXT:    vadd.f64 d4, d4, d5
; CHECK-NEXT:    vadd.f64 d5, d6, d7
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vadd.f64 d6, d6, d7
; CHECK-NEXT:    vadd.f64 d7, d8, d9
; CHECK-NEXT:    vadd.f64 d1, d1, d0
; CHECK-NEXT:    vadd.f64 d0, d3, d2
; CHECK-NEXT:    vadd.f64 d3, d5, d4
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vadd.f64 d2, d7, d6
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x double>, <16 x double>* %src, align 4
  %s1 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = fadd <4 x double> %s1, %s2
  %a2 = fadd <4 x double> %s3, %s4
  %a3 = fadd <4 x double> %a1, %a2
  store <4 x double> %a3, <4 x double> *%dst
  ret void
}
1185