1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
3
4define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) {
5; CHECK-LABEL: gather_inc_mini_4i32:
6; CHECK:       @ %bb.0:
7; CHECK-NEXT:    vmov.i32 q1, #0x4
8; CHECK-NEXT:    vadd.i32 q1, q0, q1
9; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
10; CHECK-NEXT:    bx lr
11  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
12  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
13  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
14  ret <4 x i32> %wide.masked.gather
15}
16
17define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) {
18; CHECK-LABEL: gather_inc_minipred_4i32:
19; CHECK:       @ %bb.0:
20; CHECK-NEXT:    vmov.i32 q1, #0x4
21; CHECK-NEXT:    movw r1, #3855
22; CHECK-NEXT:    vadd.i32 q1, q0, q1
23; CHECK-NEXT:    vmsr p0, r1
24; CHECK-NEXT:    vpst
25; CHECK-NEXT:    vldrwt.u32 q0, [r0, q1, uxtw #2]
26; CHECK-NEXT:    bx lr
27  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
28  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
29  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef)
30  ret <4 x i32> %wide.masked.gather
31}
32
33define arm_aapcs_vfpcc <8 x i16> @gather_inc_mini_8i16(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, <8 x i32> %offs) {
34; CHECK-LABEL: gather_inc_mini_8i16:
35; CHECK:       @ %bb.0:
36; CHECK-NEXT:    .save {r4, r5, r7, lr}
37; CHECK-NEXT:    push {r4, r5, r7, lr}
38; CHECK-NEXT:    vshl.i32 q0, q0, #1
39; CHECK-NEXT:    vmov.i32 q2, #0x10
40; CHECK-NEXT:    vadd.i32 q0, q0, r0
41; CHECK-NEXT:    vshl.i32 q1, q1, #1
42; CHECK-NEXT:    vadd.i32 q0, q0, q2
43; CHECK-NEXT:    vadd.i32 q1, q1, r0
44; CHECK-NEXT:    vmov r1, s2
45; CHECK-NEXT:    vadd.i32 q1, q1, q2
46; CHECK-NEXT:    vmov r2, s3
47; CHECK-NEXT:    vmov r3, s0
48; CHECK-NEXT:    vmov r5, s1
49; CHECK-NEXT:    vmov r0, s4
50; CHECK-NEXT:    vmov r4, s7
51; CHECK-NEXT:    ldrh.w r12, [r1]
52; CHECK-NEXT:    vmov r1, s5
53; CHECK-NEXT:    ldrh.w lr, [r2]
54; CHECK-NEXT:    vmov r2, s6
55; CHECK-NEXT:    ldrh r3, [r3]
56; CHECK-NEXT:    ldrh r5, [r5]
57; CHECK-NEXT:    vmov.16 q0[0], r3
58; CHECK-NEXT:    ldrh r0, [r0]
59; CHECK-NEXT:    vmov.16 q0[1], r5
60; CHECK-NEXT:    ldrh r4, [r4]
61; CHECK-NEXT:    vmov.16 q0[2], r12
62; CHECK-NEXT:    vmov.16 q0[3], lr
63; CHECK-NEXT:    vmov.16 q0[4], r0
64; CHECK-NEXT:    ldrh r1, [r1]
65; CHECK-NEXT:    ldrh r2, [r2]
66; CHECK-NEXT:    vmov.16 q0[5], r1
67; CHECK-NEXT:    vmov.16 q0[6], r2
68; CHECK-NEXT:    vmov.16 q0[7], r4
69; CHECK-NEXT:    pop {r4, r5, r7, pc}
70  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
71  %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1
72  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
73  ret <8 x i16> %wide.masked.gather
74}
75
76define arm_aapcs_vfpcc <8 x i16> @gather_inc_minipred_8i16(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, <8 x i32> %offs) {
77; CHECK-LABEL: gather_inc_minipred_8i16:
78; CHECK:       @ %bb.0:
79; CHECK-NEXT:    vshl.i32 q0, q0, #1
80; CHECK-NEXT:    vmov.i32 q2, #0x10
81; CHECK-NEXT:    vadd.i32 q0, q0, r0
82; CHECK-NEXT:    vshl.i32 q1, q1, #1
83; CHECK-NEXT:    vadd.i32 q0, q0, q2
84; CHECK-NEXT:    vadd.i32 q1, q1, r0
85; CHECK-NEXT:    vmov r1, s0
86; CHECK-NEXT:    vadd.i32 q1, q1, q2
87; CHECK-NEXT:    vmov r3, s2
88; CHECK-NEXT:    vmov r0, s4
89; CHECK-NEXT:    vmov r2, s6
90; CHECK-NEXT:    ldrh r1, [r1]
91; CHECK-NEXT:    ldrh r3, [r3]
92; CHECK-NEXT:    vmov.16 q0[0], r1
93; CHECK-NEXT:    ldrh r0, [r0]
94; CHECK-NEXT:    vmov.16 q0[2], r3
95; CHECK-NEXT:    ldrh r2, [r2]
96; CHECK-NEXT:    vmov.16 q0[4], r0
97; CHECK-NEXT:    vmov.16 q0[6], r2
98; CHECK-NEXT:    bx lr
99  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
100  %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1
101  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> undef)
102  ret <8 x i16> %wide.masked.gather
103}
104
105define arm_aapcs_vfpcc <16 x i8> @gather_inc_mini_16i8(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, <16 x i32> %offs) {
106; CHECK-LABEL: gather_inc_mini_16i8:
107; CHECK:       @ %bb.0:
108; CHECK-NEXT:    .save {r4, r5, r6, lr}
109; CHECK-NEXT:    push {r4, r5, r6, lr}
110; CHECK-NEXT:    .vsave {d8, d9}
111; CHECK-NEXT:    vpush {d8, d9}
112; CHECK-NEXT:    vmov.i32 q4, #0x10
113; CHECK-NEXT:    vadd.i32 q3, q3, r0
114; CHECK-NEXT:    vadd.i32 q3, q3, q4
115; CHECK-NEXT:    vadd.i32 q0, q0, r0
116; CHECK-NEXT:    vmov r1, s14
117; CHECK-NEXT:    vadd.i32 q1, q1, r0
118; CHECK-NEXT:    vadd.i32 q1, q1, q4
119; CHECK-NEXT:    vadd.i32 q2, q2, r0
120; CHECK-NEXT:    vmov r0, s4
121; CHECK-NEXT:    vadd.i32 q2, q2, q4
122; CHECK-NEXT:    vmov r2, s10
123; CHECK-NEXT:    vmov r4, s11
124; CHECK-NEXT:    ldrb.w r12, [r1]
125; CHECK-NEXT:    vmov r1, s15
126; CHECK-NEXT:    ldrb r0, [r0]
127; CHECK-NEXT:    ldrb r2, [r2]
128; CHECK-NEXT:    ldrb r4, [r4]
129; CHECK-NEXT:    ldrb.w lr, [r1]
130; CHECK-NEXT:    vmov r1, s12
131; CHECK-NEXT:    ldrb r3, [r1]
132; CHECK-NEXT:    vmov r1, s13
133; CHECK-NEXT:    vadd.i32 q3, q0, q4
134; CHECK-NEXT:    vmov r5, s12
135; CHECK-NEXT:    vmov r6, s15
136; CHECK-NEXT:    ldrb r1, [r1]
137; CHECK-NEXT:    ldrb r5, [r5]
138; CHECK-NEXT:    ldrb r6, [r6]
139; CHECK-NEXT:    vmov.8 q0[0], r5
140; CHECK-NEXT:    vmov r5, s13
141; CHECK-NEXT:    ldrb r5, [r5]
142; CHECK-NEXT:    vmov.8 q0[1], r5
143; CHECK-NEXT:    vmov r5, s14
144; CHECK-NEXT:    ldrb r5, [r5]
145; CHECK-NEXT:    vmov.8 q0[2], r5
146; CHECK-NEXT:    vmov r5, s8
147; CHECK-NEXT:    vmov.8 q0[3], r6
148; CHECK-NEXT:    vmov.8 q0[4], r0
149; CHECK-NEXT:    vmov r0, s5
150; CHECK-NEXT:    ldrb r5, [r5]
151; CHECK-NEXT:    ldrb r0, [r0]
152; CHECK-NEXT:    vmov.8 q0[5], r0
153; CHECK-NEXT:    vmov r0, s6
154; CHECK-NEXT:    ldrb r0, [r0]
155; CHECK-NEXT:    vmov.8 q0[6], r0
156; CHECK-NEXT:    vmov r0, s7
157; CHECK-NEXT:    ldrb r0, [r0]
158; CHECK-NEXT:    vmov.8 q0[7], r0
159; CHECK-NEXT:    vmov r0, s9
160; CHECK-NEXT:    vmov.8 q0[8], r5
161; CHECK-NEXT:    ldrb r0, [r0]
162; CHECK-NEXT:    vmov.8 q0[9], r0
163; CHECK-NEXT:    vmov.8 q0[10], r2
164; CHECK-NEXT:    vmov.8 q0[11], r4
165; CHECK-NEXT:    vmov.8 q0[12], r3
166; CHECK-NEXT:    vmov.8 q0[13], r1
167; CHECK-NEXT:    vmov.8 q0[14], r12
168; CHECK-NEXT:    vmov.8 q0[15], lr
169; CHECK-NEXT:    vpop {d8, d9}
170; CHECK-NEXT:    pop {r4, r5, r6, pc}
171  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
172  %2 = getelementptr inbounds i8, i8* %data, <16 x i32> %1
173  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
174  ret <16 x i8> %wide.masked.gather
175}
176
177define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, <16 x i32> %offs) {
178; CHECK-LABEL: gather_inc_minipred_16i8:
179; CHECK:       @ %bb.0:
180; CHECK-NEXT:    .save {r4, r5, r7, lr}
181; CHECK-NEXT:    push {r4, r5, r7, lr}
182; CHECK-NEXT:    .vsave {d8, d9}
183; CHECK-NEXT:    vpush {d8, d9}
184; CHECK-NEXT:    vmov.i32 q4, #0x10
185; CHECK-NEXT:    vadd.i32 q2, q2, r0
186; CHECK-NEXT:    vadd.i32 q2, q2, q4
187; CHECK-NEXT:    vadd.i32 q1, q1, r0
188; CHECK-NEXT:    vmov r2, s8
189; CHECK-NEXT:    vadd.i32 q1, q1, q4
190; CHECK-NEXT:    vmov r1, s4
191; CHECK-NEXT:    vadd.i32 q0, q0, r0
192; CHECK-NEXT:    vadd.i32 q0, q0, q4
193; CHECK-NEXT:    vmov r3, s10
194; CHECK-NEXT:    vmov r5, s2
195; CHECK-NEXT:    ldrb.w lr, [r2]
196; CHECK-NEXT:    vmov r2, s0
197; CHECK-NEXT:    ldrb.w r12, [r1]
198; CHECK-NEXT:    vmov r1, s6
199; CHECK-NEXT:    vadd.i32 q1, q3, r0
200; CHECK-NEXT:    ldrb r3, [r3]
201; CHECK-NEXT:    vadd.i32 q1, q1, q4
202; CHECK-NEXT:    ldrb r5, [r5]
203; CHECK-NEXT:    vmov r0, s4
204; CHECK-NEXT:    vmov r4, s6
205; CHECK-NEXT:    ldrb r2, [r2]
206; CHECK-NEXT:    ldrb r1, [r1]
207; CHECK-NEXT:    vmov.8 q0[0], r2
208; CHECK-NEXT:    vmov.8 q0[2], r5
209; CHECK-NEXT:    vmov.8 q0[4], r12
210; CHECK-NEXT:    ldrb r0, [r0]
211; CHECK-NEXT:    vmov.8 q0[6], r1
212; CHECK-NEXT:    ldrb r4, [r4]
213; CHECK-NEXT:    vmov.8 q0[8], lr
214; CHECK-NEXT:    vmov.8 q0[10], r3
215; CHECK-NEXT:    vmov.8 q0[12], r0
216; CHECK-NEXT:    vmov.8 q0[14], r4
217; CHECK-NEXT:    vpop {d8, d9}
218; CHECK-NEXT:    pop {r4, r5, r7, pc}
219  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
220  %2 = getelementptr inbounds i8, i8* %data, <16 x i32> %1
221  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
222  ret <16 x i8> %wide.masked.gather
223}
224
225define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
226; CHECK-LABEL: gather_pre_inc:
227; CHECK:       @ %bb.0: @ %vector.ph
228; CHECK-NEXT:    adr r3, .LCPI6_0
229; CHECK-NEXT:    vldrw.u32 q0, [r3]
230; CHECK-NEXT:    vadd.i32 q0, q0, r0
231; CHECK-NEXT:  .LBB6_1: @ %vector.body
232; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
233; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
234; CHECK-NEXT:    subs r2, #4
235; CHECK-NEXT:    vstrb.8 q1, [r1], #16
236; CHECK-NEXT:    bne .LBB6_1
237; CHECK-NEXT:  @ %bb.2: @ %end
238; CHECK-NEXT:    bx lr
239; CHECK-NEXT:    .p2align 4
240; CHECK-NEXT:  @ %bb.3:
241; CHECK-NEXT:  .LCPI6_0:
242; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
243; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
244; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
245; CHECK-NEXT:    .long 0 @ 0x0
246vector.ph:                                        ; preds = %for.body.preheader
247  %ind.end = shl i32 %n.vec, 1
248  br label %vector.body
249
250vector.body:                                      ; preds = %vector.body, %vector.ph
251  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
252  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
253  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
254  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
255  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
256  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
257  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
258  %4 = bitcast i32* %3 to <4 x i32>*
259  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
260  %index.next = add i32 %index, 4
261  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
262  %5 = icmp eq i32 %index.next, %n.vec
263  br i1 %5, label %end, label %vector.body
264
265end:
266  ret void;
267}
268
269define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec43) {
270; CHECK-LABEL: gather_post_inc:
271; CHECK:       @ %bb.0: @ %vector.ph41
272; CHECK-NEXT:    adr r3, .LCPI7_0
273; CHECK-NEXT:    vldrw.u32 q0, [r3]
274; CHECK-NEXT:    vadd.i32 q0, q0, r0
275; CHECK-NEXT:  .LBB7_1: @ %vector.body39
276; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
277; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
278; CHECK-NEXT:    subs r2, #4
279; CHECK-NEXT:    vstrb.8 q1, [r1], #16
280; CHECK-NEXT:    bne .LBB7_1
281; CHECK-NEXT:  @ %bb.2: @ %end
282; CHECK-NEXT:    bx lr
283; CHECK-NEXT:    .p2align 4
284; CHECK-NEXT:  @ %bb.3:
285; CHECK-NEXT:  .LCPI7_0:
286; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
287; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
288; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
289; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
290vector.ph41:                                      ; preds = %for.body6.preheader
291  %ind.end47 = shl i32 %n.vec43, 1
292  br label %vector.body39
293
294vector.body39:                                    ; preds = %vector.body39, %vector.ph41
295  %index44 = phi i32 [ 0, %vector.ph41 ], [ %index.next45, %vector.body39 ]
296  %vec.ind50 = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph41 ], [ %vec.ind.next51, %vector.body39 ]
297  %0 = mul nuw nsw <4 x i32> %vec.ind50, <i32 3, i32 3, i32 3, i32 3>
298  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
299  %wide.masked.gather55 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
300  %2 = getelementptr inbounds i32, i32* %dst, i32 %index44
301  %3 = bitcast i32* %2 to <4 x i32>*
302  store <4 x i32> %wide.masked.gather55, <4 x i32>* %3, align 4
303  %index.next45 = add i32 %index44, 4
304  %vec.ind.next51 = add <4 x i32> %vec.ind50, <i32 8, i32 8, i32 8, i32 8>
305  %4 = icmp eq i32 %index.next45, %n.vec43
306  br i1 %4, label %end, label %vector.body39
307
308end:
309  ret void;
310}
311
312define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) {
313; CHECK-LABEL: gather_inc_v4i32_simple:
314; CHECK:       @ %bb.0: @ %entry
315; CHECK-NEXT:    .save {r4, lr}
316; CHECK-NEXT:    push {r4, lr}
317; CHECK-NEXT:    cmp r2, #1
318; CHECK-NEXT:    it lt
319; CHECK-NEXT:    poplt {r4, pc}
320; CHECK-NEXT:  .LBB8_1: @ %vector.ph.preheader
321; CHECK-NEXT:    bic r12, r2, #3
322; CHECK-NEXT:    movs r3, #1
323; CHECK-NEXT:    sub.w lr, r12, #4
324; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
325; CHECK-NEXT:    adr r3, .LCPI8_0
326; CHECK-NEXT:    vldrw.u32 q0, [r3]
327; CHECK-NEXT:    vadd.i32 q0, q0, r0
328; CHECK-NEXT:  .LBB8_2: @ %vector.ph
329; CHECK-NEXT:    @ =>This Loop Header: Depth=1
330; CHECK-NEXT:    @ Child Loop BB8_3 Depth 2
331; CHECK-NEXT:    dls lr, r4
332; CHECK-NEXT:    mov r0, r1
333; CHECK-NEXT:    vmov q1, q0
334; CHECK-NEXT:  .LBB8_3: @ %vector.body
335; CHECK-NEXT:    @ Parent Loop BB8_2 Depth=1
336; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
337; CHECK-NEXT:    vldrw.u32 q2, [q1, #16]!
338; CHECK-NEXT:    vstrb.8 q2, [r0], #16
339; CHECK-NEXT:    le lr, .LBB8_3
340; CHECK-NEXT:  @ %bb.4: @ %middle.block
341; CHECK-NEXT:    @ in Loop: Header=BB8_2 Depth=1
342; CHECK-NEXT:    cmp r12, r2
343; CHECK-NEXT:    bne .LBB8_2
344; CHECK-NEXT:  @ %bb.5: @ %for.cond.cleanup
345; CHECK-NEXT:    pop {r4, pc}
346; CHECK-NEXT:    .p2align 4
347; CHECK-NEXT:  @ %bb.6:
348; CHECK-NEXT:  .LCPI8_0:
349; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
350; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
351; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
352; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
353entry:
354  %cmp22 = icmp sgt i32 %n, 0
355  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
356
357vector.ph:                                        ; preds = %for.body.preheader
358  %n.vec = and i32 %n, -4
359  br label %vector.body
360
361vector.body:                                      ; preds = %vector.body, %vector.ph
362  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
363  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
364  %0 = getelementptr inbounds i32, i32* %data, <4 x i32> %vec.ind
365  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
366  %1 = getelementptr inbounds i32, i32* %dst, i32 %index
367  %2 = bitcast i32* %1 to <4 x i32>*
368  store <4 x i32> %wide.masked.gather, <4 x i32>* %2, align 4
369  %index.next = add i32 %index, 4
370  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
371  %3 = icmp eq i32 %index.next, %n.vec
372  br i1 %3, label %middle.block, label %vector.body
373
374middle.block:                                     ; preds = %vector.body
375  %cmp.n = icmp eq i32 %n.vec, %n
376  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
377
378for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
379  ret void
380}
381
382define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) {
383; CHECK-LABEL: gather_inc_v4i32_complex:
384; CHECK:       @ %bb.0: @ %entry
385; CHECK-NEXT:    .save {r4, r5, r7, lr}
386; CHECK-NEXT:    push {r4, r5, r7, lr}
387; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
388; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
389; CHECK-NEXT:    cmp r2, #1
390; CHECK-NEXT:    blt .LBB9_5
391; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
392; CHECK-NEXT:    bic r12, r2, #3
393; CHECK-NEXT:    movs r3, #1
394; CHECK-NEXT:    sub.w lr, r12, #4
395; CHECK-NEXT:    adr r4, .LCPI9_1
396; CHECK-NEXT:    adr r5, .LCPI9_2
397; CHECK-NEXT:    vldrw.u32 q1, [r4]
398; CHECK-NEXT:    add.w r3, r3, lr, lsr #2
399; CHECK-NEXT:    adr.w lr, .LCPI9_0
400; CHECK-NEXT:    vldrw.u32 q0, [r5]
401; CHECK-NEXT:    vldrw.u32 q2, [lr]
402; CHECK-NEXT:    vadd.i32 q1, q1, r0
403; CHECK-NEXT:    vadd.i32 q0, q0, r0
404; CHECK-NEXT:    vadd.i32 q2, q2, r0
405; CHECK-NEXT:  .LBB9_2: @ %vector.ph
406; CHECK-NEXT:    @ =>This Loop Header: Depth=1
407; CHECK-NEXT:    @ Child Loop BB9_3 Depth 2
408; CHECK-NEXT:    dls lr, r3
409; CHECK-NEXT:    mov r0, r1
410; CHECK-NEXT:    vmov q3, q1
411; CHECK-NEXT:    vmov q4, q0
412; CHECK-NEXT:    vmov q5, q2
413; CHECK-NEXT:  .LBB9_3: @ %vector.body
414; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=1
415; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
416; CHECK-NEXT:    vldrw.u32 q6, [q5, #48]!
417; CHECK-NEXT:    vldrw.u32 q7, [q3, #48]!
418; CHECK-NEXT:    vadd.i32 q6, q7, q6
419; CHECK-NEXT:    vldrw.u32 q7, [q4, #48]!
420; CHECK-NEXT:    vadd.i32 q6, q6, q7
421; CHECK-NEXT:    vstrb.8 q6, [r0], #16
422; CHECK-NEXT:    le lr, .LBB9_3
423; CHECK-NEXT:  @ %bb.4: @ %middle.block
424; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=1
425; CHECK-NEXT:    cmp r12, r2
426; CHECK-NEXT:    bne .LBB9_2
427; CHECK-NEXT:  .LBB9_5: @ %for.cond.cleanup
428; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
429; CHECK-NEXT:    pop {r4, r5, r7, pc}
430; CHECK-NEXT:    .p2align 4
431; CHECK-NEXT:  @ %bb.6:
432; CHECK-NEXT:  .LCPI9_0:
433; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
434; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
435; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
436; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
437; CHECK-NEXT:  .LCPI9_1:
438; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
439; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
440; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
441; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
442; CHECK-NEXT:  .LCPI9_2:
443; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
444; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
445; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
446; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
447entry:
448  %cmp22 = icmp sgt i32 %n, 0
449  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
450
451vector.ph:                                        ; preds = %for.body.preheader
452  %n.vec = and i32 %n, -4
453  br label %vector.body
454
455vector.body:                                      ; preds = %vector.body, %vector.ph
456  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
457  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
458  %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
459  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
460  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
461  %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
462  %3 = getelementptr inbounds i32, i32* %data, <4 x i32> %2
463  %wide.masked.gather24 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
464  %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
465  %5 = getelementptr inbounds i32, i32* %data, <4 x i32> %4
466  %wide.masked.gather25 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
467  %6 = add nsw <4 x i32> %wide.masked.gather24, %wide.masked.gather
468  %7 = add nsw <4 x i32> %6, %wide.masked.gather25
469  %8 = getelementptr inbounds i32, i32* %dst, i32 %index
470  %9 = bitcast i32* %8 to <4 x i32>*
471  store <4 x i32> %7, <4 x i32>* %9, align 4
472  %index.next = add i32 %index, 4
473  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
474  %10 = icmp eq i32 %index.next, %n.vec
475  br i1 %10, label %middle.block, label %vector.body
476
477middle.block:                                     ; preds = %vector.body
478  %cmp.n = icmp eq i32 %n.vec, %n
479  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
480
481for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
482  ret void
483}
484
485define arm_aapcs_vfpcc void @gather_inc_v4i32_large(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) {
486; CHECK-LABEL: gather_inc_v4i32_large:
487; CHECK:       @ %bb.0: @ %entry
488; CHECK-NEXT:    .save {r4, lr}
489; CHECK-NEXT:    push {r4, lr}
490; CHECK-NEXT:    cmp r2, #1
491; CHECK-NEXT:    it lt
492; CHECK-NEXT:    poplt {r4, pc}
493; CHECK-NEXT:  .LBB10_1: @ %vector.ph.preheader
494; CHECK-NEXT:    bic r12, r2, #3
495; CHECK-NEXT:    movs r3, #1
496; CHECK-NEXT:    sub.w lr, r12, #4
497; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
498; CHECK-NEXT:    adr r3, .LCPI10_0
499; CHECK-NEXT:    vldrw.u32 q0, [r3]
500; CHECK-NEXT:    vadd.i32 q0, q0, r0
501; CHECK-NEXT:  .LBB10_2: @ %vector.ph
502; CHECK-NEXT:    @ =>This Loop Header: Depth=1
503; CHECK-NEXT:    @ Child Loop BB10_3 Depth 2
504; CHECK-NEXT:    dls lr, r4
505; CHECK-NEXT:    mov r0, r1
506; CHECK-NEXT:    vmov q1, q0
507; CHECK-NEXT:  .LBB10_3: @ %vector.body
508; CHECK-NEXT:    @ Parent Loop BB10_2 Depth=1
509; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
510; CHECK-NEXT:    vldrw.u32 q2, [q1, #508]!
511; CHECK-NEXT:    vstrb.8 q2, [r0], #16
512; CHECK-NEXT:    le lr, .LBB10_3
513; CHECK-NEXT:  @ %bb.4: @ %middle.block
514; CHECK-NEXT:    @ in Loop: Header=BB10_2 Depth=1
515; CHECK-NEXT:    cmp r12, r2
516; CHECK-NEXT:    bne .LBB10_2
517; CHECK-NEXT:  @ %bb.5: @ %for.cond.cleanup
518; CHECK-NEXT:    pop {r4, pc}
519; CHECK-NEXT:    .p2align 4
520; CHECK-NEXT:  @ %bb.6:
521; CHECK-NEXT:  .LCPI10_0:
522; CHECK-NEXT:    .long 4294966788 @ 0xfffffe04
523; CHECK-NEXT:    .long 4294966792 @ 0xfffffe08
524; CHECK-NEXT:    .long 4294966796 @ 0xfffffe0c
525; CHECK-NEXT:    .long 4294966800 @ 0xfffffe10
526entry:
527  %cmp22 = icmp sgt i32 %n, 0
528  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
529
530vector.ph:                                        ; preds = %for.body.preheader
531  %n.vec = and i32 %n, -4
532  br label %vector.body
533
534vector.body:                                      ; preds = %vector.body, %vector.ph
535  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
536  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
537  %0 = getelementptr inbounds i32, i32* %data, <4 x i32> %vec.ind
538  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
539  %1 = getelementptr inbounds i32, i32* %dst, i32 %index
540  %2 = bitcast i32* %1 to <4 x i32>*
541  store <4 x i32> %wide.masked.gather, <4 x i32>* %2, align 4
542  %index.next = add i32 %index, 4
543  %vec.ind.next = add <4 x i32> %vec.ind, <i32 127, i32 127, i32 127, i32 127>
544  %3 = icmp eq i32 %index.next, %n.vec
545  br i1 %3, label %middle.block, label %vector.body
546
547middle.block:                                     ; preds = %vector.body
548  %cmp.n = icmp eq i32 %n.vec, %n
549  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
550
551for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
552  ret void
553}
554
555; TODO: uneven - it does not seem possible to construct such a case: vec.ind is always incremented by a whole vector of 4 elements, so the number of gathered elements stays a multiple of 4 (x*4 is always even).
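; For example, with n = 10 the vectorized loops below use n.vec = n & -4 = 8, so the
; vector body gathers 4 elements at index 0 and 4 more at index 4 and then exits;
; the remaining 2 elements never reach the gather, so an "uneven" gather count
; cannot be produced this way.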
556
557; TODO: What is sxth?
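; (sxth is the AArch32 sign-extend-halfword instruction; in the code generated
; below the i16 offsets are sign-extended to i32 with vmovlb.s16 before being
; shifted and added to the base pointer.)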
558define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, i32 %n) {
559; CHECK-LABEL: gather_inc_v8i16_simple:
560; CHECK:       @ %bb.0: @ %entry
561; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
562; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
563; CHECK-NEXT:    .pad #4
564; CHECK-NEXT:    sub sp, #4
565; CHECK-NEXT:    .vsave {d8, d9}
566; CHECK-NEXT:    vpush {d8, d9}
567; CHECK-NEXT:    .pad #8
568; CHECK-NEXT:    sub sp, #8
569; CHECK-NEXT:    cmp r2, #1
570; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
571; CHECK-NEXT:    blt .LBB11_5
572; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
573; CHECK-NEXT:    bic r8, r2, #7
574; CHECK-NEXT:    movs r5, #1
575; CHECK-NEXT:    sub.w r6, r8, #8
576; CHECK-NEXT:    vmov.i16 q1, #0x8
577; CHECK-NEXT:    add.w r1, r5, r6, lsr #3
578; CHECK-NEXT:    adr r6, .LCPI11_0
579; CHECK-NEXT:    vldrw.u32 q0, [r6]
580; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
581; CHECK-NEXT:  .LBB11_2: @ %vector.ph
582; CHECK-NEXT:    @ =>This Loop Header: Depth=1
583; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
584; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
585; CHECK-NEXT:    vmov q2, q0
586; CHECK-NEXT:    dls lr, r1
587; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
588; CHECK-NEXT:  .LBB11_3: @ %vector.body
589; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
590; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
591; CHECK-NEXT:    vmov.u16 r7, q2[4]
592; CHECK-NEXT:    vmov.u16 r5, q2[0]
593; CHECK-NEXT:    vmov.32 q4[0], r7
594; CHECK-NEXT:    vmov.u16 r7, q2[5]
595; CHECK-NEXT:    vmov.32 q4[1], r7
596; CHECK-NEXT:    vmov.u16 r7, q2[6]
597; CHECK-NEXT:    vmov.32 q4[2], r7
598; CHECK-NEXT:    vmov.u16 r7, q2[7]
599; CHECK-NEXT:    vmov.32 q4[3], r7
600; CHECK-NEXT:    vmov.32 q3[0], r5
601; CHECK-NEXT:    vmovlb.s16 q4, q4
602; CHECK-NEXT:    vmov.u16 r5, q2[1]
603; CHECK-NEXT:    vshl.i32 q4, q4, #1
604; CHECK-NEXT:    vmov.32 q3[1], r5
605; CHECK-NEXT:    vadd.i32 q4, q4, r0
606; CHECK-NEXT:    vmov.u16 r5, q2[2]
607; CHECK-NEXT:    vmov r7, s16
608; CHECK-NEXT:    vmov.32 q3[2], r5
609; CHECK-NEXT:    vmov.u16 r5, q2[3]
610; CHECK-NEXT:    vmov r3, s17
611; CHECK-NEXT:    vmov.32 q3[3], r5
612; CHECK-NEXT:    vadd.i16 q2, q2, q1
613; CHECK-NEXT:    vmovlb.s16 q3, q3
614; CHECK-NEXT:    vshl.i32 q3, q3, #1
615; CHECK-NEXT:    vadd.i32 q3, q3, r0
616; CHECK-NEXT:    vmov r5, s15
617; CHECK-NEXT:    vmov r6, s14
618; CHECK-NEXT:    vmov r12, s13
619; CHECK-NEXT:    ldrh.w r11, [r7]
620; CHECK-NEXT:    vmov r7, s12
621; CHECK-NEXT:    ldrh r3, [r3]
622; CHECK-NEXT:    ldrh.w r9, [r5]
623; CHECK-NEXT:    vmov r5, s18
624; CHECK-NEXT:    ldrh.w r10, [r6]
625; CHECK-NEXT:    vmov r6, s19
626; CHECK-NEXT:    ldrh.w r1, [r12]
627; CHECK-NEXT:    ldrh r7, [r7]
628; CHECK-NEXT:    vmov.16 q3[0], r7
629; CHECK-NEXT:    vmov.16 q3[1], r1
630; CHECK-NEXT:    vmov.16 q3[2], r10
631; CHECK-NEXT:    vmov.16 q3[3], r9
632; CHECK-NEXT:    vmov.16 q3[4], r11
633; CHECK-NEXT:    ldrh r5, [r5]
634; CHECK-NEXT:    vmov.16 q3[5], r3
635; CHECK-NEXT:    ldrh r6, [r6]
636; CHECK-NEXT:    vmov.16 q3[6], r5
637; CHECK-NEXT:    vmov.16 q3[7], r6
638; CHECK-NEXT:    vstrb.8 q3, [r4], #16
639; CHECK-NEXT:    le lr, .LBB11_3
640; CHECK-NEXT:  @ %bb.4: @ %middle.block
641; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
642; CHECK-NEXT:    cmp r8, r2
643; CHECK-NEXT:    bne .LBB11_2
644; CHECK-NEXT:  .LBB11_5: @ %for.cond.cleanup
645; CHECK-NEXT:    add sp, #8
646; CHECK-NEXT:    vpop {d8, d9}
647; CHECK-NEXT:    add sp, #4
648; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
649; CHECK-NEXT:    .p2align 4
650; CHECK-NEXT:  @ %bb.6:
651; CHECK-NEXT:  .LCPI11_0:
652; CHECK-NEXT:    .short 0 @ 0x0
653; CHECK-NEXT:    .short 1 @ 0x1
654; CHECK-NEXT:    .short 2 @ 0x2
655; CHECK-NEXT:    .short 3 @ 0x3
656; CHECK-NEXT:    .short 4 @ 0x4
657; CHECK-NEXT:    .short 5 @ 0x5
658; CHECK-NEXT:    .short 6 @ 0x6
659; CHECK-NEXT:    .short 7 @ 0x7
660
661
662entry:
663  %cmp22 = icmp sgt i32 %n, 0
664  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
665
666vector.ph:                                        ; preds = %for.body.preheader
667  %n.vec = and i32 %n, -8
668  br label %vector.body
669
670vector.body:                                      ; preds = %vector.body, %vector.ph
671  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
672  %vec.ind = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
673  %0 = getelementptr inbounds i16, i16* %data, <8 x i16> %vec.ind
674  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %0, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
675  %1 = getelementptr inbounds i16, i16* %dst, i32 %index
676  %2 = bitcast i16* %1 to <8 x i16>*
677  store <8 x i16> %wide.masked.gather, <8 x i16>* %2, align 2
678  %index.next = add i32 %index, 8
679  %vec.ind.next = add <8 x i16> %vec.ind, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
680  %3 = icmp eq i32 %index.next, %n.vec
681  br i1 %3, label %middle.block, label %vector.body
682
683middle.block:                                     ; preds = %vector.body
684  %cmp.n = icmp eq i32 %n.vec, %n
685  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
686
687for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
688  ret void
689}
690
691; TODO: This looks absolutely terrifying :(
692define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, i32 %n) {
693; CHECK-LABEL: gather_inc_v8i16_complex:
694; CHECK:       @ %bb.0: @ %entry
695; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
696; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
697; CHECK-NEXT:    .pad #4
698; CHECK-NEXT:    sub sp, #4
699; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
700; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
701; CHECK-NEXT:    .pad #104
702; CHECK-NEXT:    sub sp, #104
703; CHECK-NEXT:    cmp r2, #1
704; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
705; CHECK-NEXT:    blt.w .LBB12_5
706; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
707; CHECK-NEXT:    bic r8, r2, #7
708; CHECK-NEXT:    adr r6, .LCPI12_2
709; CHECK-NEXT:    sub.w r3, r8, #8
710; CHECK-NEXT:    vldrw.u32 q0, [r6]
711; CHECK-NEXT:    movs r7, #1
712; CHECK-NEXT:    vmov.i16 q3, #0x18
713; CHECK-NEXT:    add.w r1, r7, r3, lsr #3
714; CHECK-NEXT:    adr r3, .LCPI12_0
715; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
716; CHECK-NEXT:    vldrw.u32 q0, [r3]
717; CHECK-NEXT:    adr r7, .LCPI12_1
718; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
719; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
720; CHECK-NEXT:    vldrw.u32 q0, [r7]
721; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
722; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
723; CHECK-NEXT:  .LBB12_2: @ %vector.ph
724; CHECK-NEXT:    @ =>This Loop Header: Depth=1
725; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
726; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
727; CHECK-NEXT:    dls lr, r1
728; CHECK-NEXT:    ldr r4, [sp, #60] @ 4-byte Reload
729; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
730; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
731; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
732; CHECK-NEXT:  .LBB12_3: @ %vector.body
733; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
734; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
735; CHECK-NEXT:    vmov.u16 r3, q5[0]
736; CHECK-NEXT:    vmov.u16 r7, q7[4]
737; CHECK-NEXT:    vmov.32 q0[0], r3
738; CHECK-NEXT:    vmov.u16 r3, q5[1]
739; CHECK-NEXT:    vmov.32 q0[1], r3
740; CHECK-NEXT:    vmov.u16 r3, q5[2]
741; CHECK-NEXT:    vmov.32 q0[2], r3
742; CHECK-NEXT:    vmov.u16 r3, q5[3]
743; CHECK-NEXT:    vmov.32 q0[3], r3
744; CHECK-NEXT:    vmov.u16 r12, q6[0]
745; CHECK-NEXT:    vmovlb.s16 q0, q0
746; CHECK-NEXT:    vmov.32 q1[0], r12
747; CHECK-NEXT:    vshl.i32 q0, q0, #1
748; CHECK-NEXT:    vmov.u16 r1, q6[1]
749; CHECK-NEXT:    vadd.i32 q2, q0, r0
750; CHECK-NEXT:    vmov.32 q1[1], r1
751; CHECK-NEXT:    vmov r3, s10
752; CHECK-NEXT:    vmov.u16 r1, q6[2]
753; CHECK-NEXT:    vmov.32 q1[2], r1
754; CHECK-NEXT:    vmov.u16 r1, q6[3]
755; CHECK-NEXT:    vmov.32 q1[3], r1
756; CHECK-NEXT:    vmov.u16 r1, q6[4]
757; CHECK-NEXT:    vmovlb.s16 q1, q1
758; CHECK-NEXT:    vmov r6, s11
759; CHECK-NEXT:    vshl.i32 q1, q1, #1
760; CHECK-NEXT:    vadd.i32 q4, q1, r0
761; CHECK-NEXT:    ldrh.w r9, [r3]
762; CHECK-NEXT:    vmov.u16 r3, q5[4]
763; CHECK-NEXT:    vmov.32 q0[0], r3
764; CHECK-NEXT:    vmov.u16 r3, q5[5]
765; CHECK-NEXT:    vmov.32 q0[1], r3
766; CHECK-NEXT:    vmov.u16 r3, q5[6]
767; CHECK-NEXT:    vmov.32 q0[2], r3
768; CHECK-NEXT:    vmov.u16 r3, q5[7]
769; CHECK-NEXT:    vmov.32 q0[3], r3
770; CHECK-NEXT:    ldrh r6, [r6]
771; CHECK-NEXT:    vmovlb.s16 q0, q0
772; CHECK-NEXT:    vshl.i32 q0, q0, #1
773; CHECK-NEXT:    vadd.i32 q0, q0, r0
774; CHECK-NEXT:    vmov r3, s0
775; CHECK-NEXT:    vmov r5, s3
776; CHECK-NEXT:    ldrh.w r10, [r3]
777; CHECK-NEXT:    vmov r3, s1
778; CHECK-NEXT:    ldrh r5, [r5]
779; CHECK-NEXT:    ldrh.w r11, [r3]
780; CHECK-NEXT:    vmov r3, s2
781; CHECK-NEXT:    vmov.32 q0[0], r7
782; CHECK-NEXT:    vmov.u16 r7, q7[5]
783; CHECK-NEXT:    vmov.32 q0[1], r7
784; CHECK-NEXT:    vmov.u16 r7, q7[6]
785; CHECK-NEXT:    vmov.32 q0[2], r7
786; CHECK-NEXT:    vmov.u16 r7, q7[7]
787; CHECK-NEXT:    vmov.32 q0[3], r7
788; CHECK-NEXT:    vmovlb.s16 q0, q0
789; CHECK-NEXT:    vshl.i32 q0, q0, #1
790; CHECK-NEXT:    vadd.i32 q0, q0, r0
791; CHECK-NEXT:    vmov r7, s2
792; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
793; CHECK-NEXT:    vmov.32 q0[0], r1
794; CHECK-NEXT:    vmov.u16 r1, q6[5]
795; CHECK-NEXT:    vmov.32 q0[1], r1
796; CHECK-NEXT:    vmov.u16 r1, q6[6]
797; CHECK-NEXT:    vmov.32 q0[2], r1
798; CHECK-NEXT:    vmov.u16 r1, q6[7]
799; CHECK-NEXT:    vmov.32 q0[3], r1
800; CHECK-NEXT:    vmov.u16 r1, q7[0]
801; CHECK-NEXT:    vmov.32 q3[0], r1
802; CHECK-NEXT:    vmov.u16 r1, q7[1]
803; CHECK-NEXT:    vmov.32 q3[1], r1
804; CHECK-NEXT:    vmov.u16 r1, q7[2]
805; CHECK-NEXT:    vmov.32 q3[2], r1
806; CHECK-NEXT:    vmov.u16 r1, q7[3]
807; CHECK-NEXT:    vmov.32 q3[3], r1
808; CHECK-NEXT:    vmov r1, s8
809; CHECK-NEXT:    vmovlb.s16 q0, q0
810; CHECK-NEXT:    vmovlb.s16 q3, q3
811; CHECK-NEXT:    vshl.i32 q0, q0, #1
812; CHECK-NEXT:    vshl.i32 q3, q3, #1
813; CHECK-NEXT:    vadd.i32 q0, q0, r0
814; CHECK-NEXT:    vadd.i32 q3, q3, r0
815; CHECK-NEXT:    ldrh r3, [r3]
816; CHECK-NEXT:    ldrh r7, [r7]
817; CHECK-NEXT:    ldrh r1, [r1]
818; CHECK-NEXT:    vmov.16 q1[0], r1
819; CHECK-NEXT:    vmov r1, s9
820; CHECK-NEXT:    ldrh r1, [r1]
821; CHECK-NEXT:    vmov.16 q1[1], r1
822; CHECK-NEXT:    vmov r1, s16
823; CHECK-NEXT:    vmov.16 q1[2], r9
824; CHECK-NEXT:    vmov.16 q1[3], r6
825; CHECK-NEXT:    vmov.16 q1[4], r10
826; CHECK-NEXT:    vmov.16 q1[5], r11
827; CHECK-NEXT:    vmov.16 q1[6], r3
828; CHECK-NEXT:    vmov.16 q1[7], r5
829; CHECK-NEXT:    ldrh r1, [r1]
830; CHECK-NEXT:    vmov.16 q2[0], r1
831; CHECK-NEXT:    vmov r1, s17
832; CHECK-NEXT:    ldrh r1, [r1]
833; CHECK-NEXT:    vmov.16 q2[1], r1
834; CHECK-NEXT:    vmov r1, s18
835; CHECK-NEXT:    ldrh r1, [r1]
836; CHECK-NEXT:    vmov.16 q2[2], r1
837; CHECK-NEXT:    vmov r1, s19
838; CHECK-NEXT:    vldrw.u32 q4, [sp, #80] @ 16-byte Reload
839; CHECK-NEXT:    ldrh r1, [r1]
840; CHECK-NEXT:    vmov.16 q2[3], r1
841; CHECK-NEXT:    vmov r1, s0
842; CHECK-NEXT:    ldrh r1, [r1]
843; CHECK-NEXT:    vmov.16 q2[4], r1
844; CHECK-NEXT:    vmov r1, s1
845; CHECK-NEXT:    ldrh r1, [r1]
846; CHECK-NEXT:    vmov.16 q2[5], r1
847; CHECK-NEXT:    vmov r1, s2
848; CHECK-NEXT:    ldrh r1, [r1]
849; CHECK-NEXT:    vmov.16 q2[6], r1
850; CHECK-NEXT:    vmov r1, s3
851; CHECK-NEXT:    ldrh r1, [r1]
852; CHECK-NEXT:    vmov.16 q2[7], r1
853; CHECK-NEXT:    vmov r1, s12
854; CHECK-NEXT:    ldrh r1, [r1]
855; CHECK-NEXT:    vmov.16 q0[0], r1
856; CHECK-NEXT:    vmov r1, s13
857; CHECK-NEXT:    ldrh r1, [r1]
858; CHECK-NEXT:    vmov.16 q0[1], r1
859; CHECK-NEXT:    vmov r1, s14
860; CHECK-NEXT:    ldrh r1, [r1]
861; CHECK-NEXT:    vmov.16 q0[2], r1
862; CHECK-NEXT:    vmov r1, s15
863; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
864; CHECK-NEXT:    vadd.i16 q6, q6, q3
865; CHECK-NEXT:    vadd.i16 q5, q5, q3
866; CHECK-NEXT:    vadd.i16 q7, q7, q3
867; CHECK-NEXT:    ldrh r1, [r1]
868; CHECK-NEXT:    vmov.16 q0[3], r1
869; CHECK-NEXT:    vmov r1, s16
870; CHECK-NEXT:    ldrh r1, [r1]
871; CHECK-NEXT:    vmov.16 q0[4], r1
872; CHECK-NEXT:    vmov r1, s17
873; CHECK-NEXT:    ldrh r1, [r1]
874; CHECK-NEXT:    vmov.16 q0[5], r1
875; CHECK-NEXT:    vmov r1, s19
876; CHECK-NEXT:    vmov.16 q0[6], r7
877; CHECK-NEXT:    ldrh r1, [r1]
878; CHECK-NEXT:    vmov.16 q0[7], r1
879; CHECK-NEXT:    vadd.i16 q0, q0, q2
880; CHECK-NEXT:    vadd.i16 q0, q0, q1
881; CHECK-NEXT:    vstrb.8 q0, [r4], #16
882; CHECK-NEXT:    le lr, .LBB12_3
883; CHECK-NEXT:  @ %bb.4: @ %middle.block
884; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
885; CHECK-NEXT:    cmp r8, r2
886; CHECK-NEXT:    bne.w .LBB12_2
887; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
888; CHECK-NEXT:    add sp, #104
889; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
890; CHECK-NEXT:    add sp, #4
891; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
892; CHECK-NEXT:    .p2align 4
893; CHECK-NEXT:  @ %bb.6:
894; CHECK-NEXT:  .LCPI12_0:
895; CHECK-NEXT:    .short 1 @ 0x1
896; CHECK-NEXT:    .short 4 @ 0x4
897; CHECK-NEXT:    .short 7 @ 0x7
898; CHECK-NEXT:    .short 10 @ 0xa
899; CHECK-NEXT:    .short 13 @ 0xd
900; CHECK-NEXT:    .short 16 @ 0x10
901; CHECK-NEXT:    .short 19 @ 0x13
902; CHECK-NEXT:    .short 22 @ 0x16
903; CHECK-NEXT:  .LCPI12_1:
904; CHECK-NEXT:    .short 0 @ 0x0
905; CHECK-NEXT:    .short 3 @ 0x3
906; CHECK-NEXT:    .short 6 @ 0x6
907; CHECK-NEXT:    .short 9 @ 0x9
908; CHECK-NEXT:    .short 12 @ 0xc
909; CHECK-NEXT:    .short 15 @ 0xf
910; CHECK-NEXT:    .short 18 @ 0x12
911; CHECK-NEXT:    .short 21 @ 0x15
912; CHECK-NEXT:  .LCPI12_2:
913; CHECK-NEXT:    .short 2 @ 0x2
914; CHECK-NEXT:    .short 5 @ 0x5
915; CHECK-NEXT:    .short 8 @ 0x8
916; CHECK-NEXT:    .short 11 @ 0xb
917; CHECK-NEXT:    .short 14 @ 0xe
918; CHECK-NEXT:    .short 17 @ 0x11
919; CHECK-NEXT:    .short 20 @ 0x14
920; CHECK-NEXT:    .short 23 @ 0x17
921
922
923entry:
924  %cmp22 = icmp sgt i32 %n, 0
925  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
926
927vector.ph:                                        ; preds = %for.body.preheader
928  %n.vec = and i32 %n, -8
929  br label %vector.body
930
931vector.body:                                      ; preds = %vector.body, %vector.ph
932  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
933  %vec.ind = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
934  %0 = mul nuw nsw <8 x i16> %vec.ind, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
935  %1 = getelementptr inbounds i16, i16* %data, <8 x i16> %0
936  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
937  %2 = add nuw nsw <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
938  %3 = getelementptr inbounds i16, i16* %data, <8 x i16> %2
939  %wide.masked.gather24 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %3, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
940  %4 = add nuw nsw <8 x i16> %0, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
941  %5 = getelementptr inbounds i16, i16* %data, <8 x i16> %4
942  %wide.masked.gather25 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %5, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
943  %6 = add nsw <8 x i16> %wide.masked.gather24, %wide.masked.gather
944  %7 = add nsw <8 x i16> %6, %wide.masked.gather25
945  %8 = getelementptr inbounds i16, i16* %dst, i32 %index
946  %9 = bitcast i16* %8 to <8 x i16>*
947  store <8 x i16> %7, <8 x i16>* %9, align 2
948  %index.next = add i32 %index, 8
949  %vec.ind.next = add <8 x i16> %vec.ind, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
950  %10 = icmp eq i32 %index.next, %n.vec
951  br i1 %10, label %middle.block, label %vector.body
952
953middle.block:                                     ; preds = %vector.body
954  %cmp.n = icmp eq i32 %n.vec, %n
955  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
956
957for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
958  ret void
959}
960
961
962define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, i32 %n) {
963; CHECK-LABEL: gather_inc_v16i8_complex:
964; CHECK:       @ %bb.0: @ %entry
965; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
966; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
967; CHECK-NEXT:    .pad #4
968; CHECK-NEXT:    sub sp, #4
969; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
970; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
971; CHECK-NEXT:    .pad #328
972; CHECK-NEXT:    sub sp, #328
973; CHECK-NEXT:    cmp r2, #1
974; CHECK-NEXT:    str r1, [sp, #120] @ 4-byte Spill
975; CHECK-NEXT:    mov r1, r2
976; CHECK-NEXT:    str r2, [sp, #124] @ 4-byte Spill
977; CHECK-NEXT:    blt.w .LBB13_5
978; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
979; CHECK-NEXT:    ldr r1, [sp, #124] @ 4-byte Reload
980; CHECK-NEXT:    adr.w r6, .LCPI13_8
981; CHECK-NEXT:    adr.w r7, .LCPI13_7
982; CHECK-NEXT:    adr.w r3, .LCPI13_6
983; CHECK-NEXT:    bic r11, r1, #7
984; CHECK-NEXT:    adr r1, .LCPI13_0
985; CHECK-NEXT:    vldrw.u32 q0, [r1]
986; CHECK-NEXT:    adr r1, .LCPI13_1
987; CHECK-NEXT:    vmov.i32 q5, #0x30
988; CHECK-NEXT:    str.w r11, [sp, #116] @ 4-byte Spill
989; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
990; CHECK-NEXT:    vldrw.u32 q0, [r1]
991; CHECK-NEXT:    adr r1, .LCPI13_5
992; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
993; CHECK-NEXT:    vldrw.u32 q0, [r6]
994; CHECK-NEXT:    adr.w r6, .LCPI13_9
995; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
996; CHECK-NEXT:    vldrw.u32 q0, [r7]
997; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
998; CHECK-NEXT:    vldrw.u32 q0, [r6]
999; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
1000; CHECK-NEXT:    vldrw.u32 q0, [r1]
1001; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
1002; CHECK-NEXT:    vldrw.u32 q0, [r3]
1003; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
1004; CHECK-NEXT:  .LBB13_2: @ %vector.ph
1005; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1006; CHECK-NEXT:    @ Child Loop BB13_3 Depth 2
1007; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
1008; CHECK-NEXT:    adr r1, .LCPI13_3
1009; CHECK-NEXT:    vldrw.u32 q1, [r1]
1010; CHECK-NEXT:    adr r1, .LCPI13_4
1011; CHECK-NEXT:    vstrw.32 q2, [sp, #288] @ 16-byte Spill
1012; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
1013; CHECK-NEXT:    vldrw.u32 q3, [r1]
1014; CHECK-NEXT:    adr r1, .LCPI13_2
1015; CHECK-NEXT:    vstrw.32 q2, [sp, #224] @ 16-byte Spill
1016; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
1017; CHECK-NEXT:    vldrw.u32 q0, [r1]
1018; CHECK-NEXT:    adr r1, .LCPI13_10
1019; CHECK-NEXT:    vstrw.32 q2, [sp, #272] @ 16-byte Spill
1020; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
1021; CHECK-NEXT:    vstrw.32 q0, [sp, #304] @ 16-byte Spill
1022; CHECK-NEXT:    vldrw.u32 q0, [r1]
1023; CHECK-NEXT:    adr r1, .LCPI13_11
1024; CHECK-NEXT:    ldr.w r9, [sp, #120] @ 4-byte Reload
1025; CHECK-NEXT:    vstrw.32 q2, [sp, #208] @ 16-byte Spill
1026; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
1027; CHECK-NEXT:    vldrw.u32 q6, [r1]
1028; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
1029; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
1030; CHECK-NEXT:    vstrw.32 q2, [sp, #192] @ 16-byte Spill
1031; CHECK-NEXT:  .LBB13_3: @ %vector.body
1032; CHECK-NEXT:    @ Parent Loop BB13_2 Depth=1
1033; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1034; CHECK-NEXT:    vstrw.32 q3, [sp, #240] @ 16-byte Spill
1035; CHECK-NEXT:    vadd.i32 q3, q6, r0
1036; CHECK-NEXT:    vmov r1, s15
1037; CHECK-NEXT:    vstrw.32 q1, [sp, #256] @ 16-byte Spill
1038; CHECK-NEXT:    vadd.i32 q1, q0, r0
1039; CHECK-NEXT:    vstrw.32 q0, [sp, #176] @ 16-byte Spill
1040; CHECK-NEXT:    vadd.i32 q0, q7, r0
1041; CHECK-NEXT:    vstrw.32 q6, [sp, #160] @ 16-byte Spill
1042; CHECK-NEXT:    vldrw.u32 q6, [sp, #256] @ 16-byte Reload
1043; CHECK-NEXT:    vstrw.32 q7, [sp, #144] @ 16-byte Spill
1044; CHECK-NEXT:    vmov r5, s7
1045; CHECK-NEXT:    vldrw.u32 q2, [sp, #240] @ 16-byte Reload
1046; CHECK-NEXT:    vadd.i32 q6, q6, r0
1047; CHECK-NEXT:    vstrw.32 q4, [sp, #128] @ 16-byte Spill
1048; CHECK-NEXT:    subs.w r11, r11, #16
1049; CHECK-NEXT:    ldrb.w r12, [r1]
1050; CHECK-NEXT:    vmov r1, s5
1051; CHECK-NEXT:    ldrb r5, [r5]
1052; CHECK-NEXT:    ldrb.w lr, [r1]
1053; CHECK-NEXT:    vmov r1, s0
1054; CHECK-NEXT:    ldrb r6, [r1]
1055; CHECK-NEXT:    vmov r1, s2
1056; CHECK-NEXT:    ldrb.w r10, [r1]
1057; CHECK-NEXT:    vmov r1, s3
1058; CHECK-NEXT:    ldrb r4, [r1]
1059; CHECK-NEXT:    vmov r1, s6
1060; CHECK-NEXT:    ldrb.w r8, [r1]
1061; CHECK-NEXT:    vmov r1, s24
1062; CHECK-NEXT:    ldrb r1, [r1]
1063; CHECK-NEXT:    vmov.8 q7[0], r1
1064; CHECK-NEXT:    vmov r1, s25
1065; CHECK-NEXT:    ldrb r1, [r1]
1066; CHECK-NEXT:    vmov.8 q7[1], r1
1067; CHECK-NEXT:    vmov r1, s26
1068; CHECK-NEXT:    ldrb r1, [r1]
1069; CHECK-NEXT:    vmov.8 q7[2], r1
1070; CHECK-NEXT:    vmov r1, s27
1071; CHECK-NEXT:    ldrb r1, [r1]
1072; CHECK-NEXT:    vmov.8 q7[3], r1
1073; CHECK-NEXT:    vmov r1, s12
1074; CHECK-NEXT:    vmov.8 q7[4], r6
1075; CHECK-NEXT:    ldrb r1, [r1]
1076; CHECK-NEXT:    vmov.8 q6[0], r1
1077; CHECK-NEXT:    vmov r1, s13
1078; CHECK-NEXT:    ldrb r1, [r1]
1079; CHECK-NEXT:    vmov.8 q6[1], r1
1080; CHECK-NEXT:    vmov r1, s14
1081; CHECK-NEXT:    vadd.i32 q3, q4, r0
1082; CHECK-NEXT:    vldrw.u32 q4, [sp, #224] @ 16-byte Reload
1083; CHECK-NEXT:    vmov r2, s12
1084; CHECK-NEXT:    vmov r3, s15
1085; CHECK-NEXT:    ldrb r1, [r1]
1086; CHECK-NEXT:    vmov.8 q6[2], r1
1087; CHECK-NEXT:    vmov r1, s4
1088; CHECK-NEXT:    vmov.8 q6[3], r12
1089; CHECK-NEXT:    ldrb r2, [r2]
1090; CHECK-NEXT:    vldrw.u32 q1, [sp, #304] @ 16-byte Reload
1091; CHECK-NEXT:    ldrb r3, [r3]
1092; CHECK-NEXT:    vstrw.32 q1, [sp, #304] @ 16-byte Spill
1093; CHECK-NEXT:    vadd.i32 q1, q1, r0
1094; CHECK-NEXT:    ldrb r1, [r1]
1095; CHECK-NEXT:    vmov.8 q6[4], r1
1096; CHECK-NEXT:    vmov r1, s13
1097; CHECK-NEXT:    vmov.8 q6[5], lr
1098; CHECK-NEXT:    vmov.8 q6[6], r8
1099; CHECK-NEXT:    vmov.8 q6[7], r5
1100; CHECK-NEXT:    vmov r5, s1
1101; CHECK-NEXT:    vldrw.u32 q0, [sp, #288] @ 16-byte Reload
1102; CHECK-NEXT:    vstrw.32 q0, [sp, #288] @ 16-byte Spill
1103; CHECK-NEXT:    vadd.i32 q0, q0, r0
1104; CHECK-NEXT:    vmov r6, s0
1105; CHECK-NEXT:    ldrb r7, [r1]
1106; CHECK-NEXT:    vmov r1, s14
1107; CHECK-NEXT:    vldrw.u32 q3, [sp, #208] @ 16-byte Reload
1108; CHECK-NEXT:    ldrb r5, [r5]
1109; CHECK-NEXT:    vmov.8 q7[5], r5
1110; CHECK-NEXT:    vmov r5, s1
1111; CHECK-NEXT:    vmov.8 q7[6], r10
1112; CHECK-NEXT:    vmov.8 q7[7], r4
1113; CHECK-NEXT:    vmov r4, s2
1114; CHECK-NEXT:    vmov.8 q7[8], r2
1115; CHECK-NEXT:    ldrb r6, [r6]
1116; CHECK-NEXT:    vmov.8 q7[9], r7
1117; CHECK-NEXT:    ldrb r1, [r1]
1118; CHECK-NEXT:    vmov.8 q7[10], r1
1119; CHECK-NEXT:    vmov r1, s4
1120; CHECK-NEXT:    vmov.8 q7[11], r3
1121; CHECK-NEXT:    vmov.8 q7[12], r6
1122; CHECK-NEXT:    ldrb r5, [r5]
1123; CHECK-NEXT:    ldrb r4, [r4]
1124; CHECK-NEXT:    vmov.8 q7[13], r5
1125; CHECK-NEXT:    vmov.8 q7[14], r4
1126; CHECK-NEXT:    ldrb r1, [r1]
1127; CHECK-NEXT:    vmov.8 q6[8], r1
1128; CHECK-NEXT:    vmov r1, s5
1129; CHECK-NEXT:    ldrb r1, [r1]
1130; CHECK-NEXT:    vmov.8 q6[9], r1
1131; CHECK-NEXT:    vmov r1, s6
1132; CHECK-NEXT:    ldrb r1, [r1]
1133; CHECK-NEXT:    vmov.8 q6[10], r1
1134; CHECK-NEXT:    vmov r1, s7
1135; CHECK-NEXT:    vadd.i32 q1, q2, r0
1136; CHECK-NEXT:    vldrw.u32 q2, [sp, #192] @ 16-byte Reload
1137; CHECK-NEXT:    ldrb r1, [r1]
1138; CHECK-NEXT:    vmov.8 q6[11], r1
1139; CHECK-NEXT:    vmov r1, s4
1140; CHECK-NEXT:    ldrb r1, [r1]
1141; CHECK-NEXT:    vmov.8 q6[12], r1
1142; CHECK-NEXT:    vmov r1, s5
1143; CHECK-NEXT:    ldrb r1, [r1]
1144; CHECK-NEXT:    vmov.8 q6[13], r1
1145; CHECK-NEXT:    vmov r1, s6
1146; CHECK-NEXT:    ldrb r1, [r1]
1147; CHECK-NEXT:    vmov.8 q6[14], r1
1148; CHECK-NEXT:    vmov r1, s7
1149; CHECK-NEXT:    vldrw.u32 q1, [sp, #256] @ 16-byte Reload
1150; CHECK-NEXT:    vadd.i32 q1, q1, q5
1151; CHECK-NEXT:    ldrb r1, [r1]
1152; CHECK-NEXT:    vmov.8 q6[15], r1
1153; CHECK-NEXT:    vmov r1, s3
1154; CHECK-NEXT:    vadd.i32 q0, q4, r0
1155; CHECK-NEXT:    vadd.i32 q4, q4, q5
1156; CHECK-NEXT:    vmov r2, s1
1157; CHECK-NEXT:    vstrw.32 q4, [sp, #224] @ 16-byte Spill
1158; CHECK-NEXT:    ldrb r1, [r1]
1159; CHECK-NEXT:    vmov.8 q7[15], r1
1160; CHECK-NEXT:    vmov r1, s0
1161; CHECK-NEXT:    vadd.i8 q6, q7, q6
1162; CHECK-NEXT:    ldrb r2, [r2]
1163; CHECK-NEXT:    ldrb r1, [r1]
1164; CHECK-NEXT:    vmov.8 q7[0], r1
1165; CHECK-NEXT:    vmov r1, s2
1166; CHECK-NEXT:    vmov.8 q7[1], r2
1167; CHECK-NEXT:    ldrb r1, [r1]
1168; CHECK-NEXT:    vmov.8 q7[2], r1
1169; CHECK-NEXT:    vmov r1, s3
1170; CHECK-NEXT:    vldrw.u32 q0, [sp, #272] @ 16-byte Reload
1171; CHECK-NEXT:    vstrw.32 q0, [sp, #272] @ 16-byte Spill
1172; CHECK-NEXT:    vadd.i32 q0, q0, r0
1173; CHECK-NEXT:    vldrw.u32 q4, [sp, #272] @ 16-byte Reload
1174; CHECK-NEXT:    vadd.i32 q4, q4, q5
1175; CHECK-NEXT:    vstrw.32 q4, [sp, #272] @ 16-byte Spill
1176; CHECK-NEXT:    vldrw.u32 q4, [sp, #304] @ 16-byte Reload
1177; CHECK-NEXT:    vadd.i32 q4, q4, q5
1178; CHECK-NEXT:    vstrw.32 q4, [sp, #304] @ 16-byte Spill
1179; CHECK-NEXT:    vldrw.u32 q4, [sp, #128] @ 16-byte Reload
1180; CHECK-NEXT:    vadd.i32 q4, q4, q5
1181; CHECK-NEXT:    ldrb r1, [r1]
1182; CHECK-NEXT:    vmov.8 q7[3], r1
1183; CHECK-NEXT:    vmov r1, s0
1184; CHECK-NEXT:    ldrb r1, [r1]
1185; CHECK-NEXT:    vmov.8 q7[4], r1
1186; CHECK-NEXT:    vmov r1, s1
1187; CHECK-NEXT:    ldrb r1, [r1]
1188; CHECK-NEXT:    vmov.8 q7[5], r1
1189; CHECK-NEXT:    vmov r1, s2
1190; CHECK-NEXT:    ldrb r1, [r1]
1191; CHECK-NEXT:    vmov.8 q7[6], r1
1192; CHECK-NEXT:    vmov r1, s3
1193; CHECK-NEXT:    vadd.i32 q0, q3, r0
1194; CHECK-NEXT:    vadd.i32 q3, q3, q5
1195; CHECK-NEXT:    vstrw.32 q3, [sp, #208] @ 16-byte Spill
1196; CHECK-NEXT:    vldrw.u32 q3, [sp, #240] @ 16-byte Reload
1197; CHECK-NEXT:    vadd.i32 q3, q3, q5
1198; CHECK-NEXT:    ldrb r1, [r1]
1199; CHECK-NEXT:    vmov.8 q7[7], r1
1200; CHECK-NEXT:    vmov r1, s0
1201; CHECK-NEXT:    ldrb r1, [r1]
1202; CHECK-NEXT:    vmov.8 q7[8], r1
1203; CHECK-NEXT:    vmov r1, s1
1204; CHECK-NEXT:    ldrb r1, [r1]
1205; CHECK-NEXT:    vmov.8 q7[9], r1
1206; CHECK-NEXT:    vmov r1, s2
1207; CHECK-NEXT:    ldrb r1, [r1]
1208; CHECK-NEXT:    vmov.8 q7[10], r1
1209; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vadd.i32 q0, q2, r0
; CHECK-NEXT:    vadd.i32 q2, q2, q5
; CHECK-NEXT:    vstrw.32 q2, [sp, #192] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q2, [sp, #288] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q2, q2, q5
; CHECK-NEXT:    vstrw.32 q2, [sp, #288] @ 16-byte Spill
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[11], r1
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[12], r1
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[13], r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[14], r1
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q7[15], r1
; CHECK-NEXT:    vadd.i8 q0, q6, q7
; CHECK-NEXT:    vldrw.u32 q7, [sp, #144] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q6, [sp, #160] @ 16-byte Reload
; CHECK-NEXT:    vstrb.8 q0, [r9], #16
; CHECK-NEXT:    vldrw.u32 q0, [sp, #176] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q7, q7, q5
; CHECK-NEXT:    vadd.i32 q6, q6, q5
; CHECK-NEXT:    vadd.i32 q0, q0, q5
; CHECK-NEXT:    bne.w .LBB13_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB13_2 Depth=1
; CHECK-NEXT:    ldr r1, [sp, #124] @ 4-byte Reload
; CHECK-NEXT:    ldr.w r11, [sp, #116] @ 4-byte Reload
; CHECK-NEXT:    cmp r11, r1
; CHECK-NEXT:    bne.w .LBB13_2
; CHECK-NEXT:  .LBB13_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #328
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI13_0:
; CHECK-NEXT:    .long 38 @ 0x26
; CHECK-NEXT:    .long 41 @ 0x29
; CHECK-NEXT:    .long 44 @ 0x2c
; CHECK-NEXT:    .long 47 @ 0x2f
; CHECK-NEXT:  .LCPI13_1:
; CHECK-NEXT:    .long 14 @ 0xe
; CHECK-NEXT:    .long 17 @ 0x11
; CHECK-NEXT:    .long 20 @ 0x14
; CHECK-NEXT:    .long 23 @ 0x17
; CHECK-NEXT:  .LCPI13_2:
; CHECK-NEXT:    .long 24 @ 0x18
; CHECK-NEXT:    .long 27 @ 0x1b
; CHECK-NEXT:    .long 30 @ 0x1e
; CHECK-NEXT:    .long 33 @ 0x21
; CHECK-NEXT:  .LCPI13_3:
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 4 @ 0x4
; CHECK-NEXT:    .long 7 @ 0x7
; CHECK-NEXT:    .long 10 @ 0xa
; CHECK-NEXT:  .LCPI13_4:
; CHECK-NEXT:    .long 36 @ 0x24
; CHECK-NEXT:    .long 39 @ 0x27
; CHECK-NEXT:    .long 42 @ 0x2a
; CHECK-NEXT:    .long 45 @ 0x2d
; CHECK-NEXT:  .LCPI13_5:
; CHECK-NEXT:    .long 25 @ 0x19
; CHECK-NEXT:    .long 28 @ 0x1c
; CHECK-NEXT:    .long 31 @ 0x1f
; CHECK-NEXT:    .long 34 @ 0x22
; CHECK-NEXT:  .LCPI13_6:
; CHECK-NEXT:    .long 13 @ 0xd
; CHECK-NEXT:    .long 16 @ 0x10
; CHECK-NEXT:    .long 19 @ 0x13
; CHECK-NEXT:    .long 22 @ 0x16
; CHECK-NEXT:  .LCPI13_7:
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:  .LCPI13_8:
; CHECK-NEXT:    .long 26 @ 0x1a
; CHECK-NEXT:    .long 29 @ 0x1d
; CHECK-NEXT:    .long 32 @ 0x20
; CHECK-NEXT:    .long 35 @ 0x23
; CHECK-NEXT:  .LCPI13_9:
; CHECK-NEXT:    .long 37 @ 0x25
; CHECK-NEXT:    .long 40 @ 0x28
; CHECK-NEXT:    .long 43 @ 0x2b
; CHECK-NEXT:    .long 46 @ 0x2e
; CHECK-NEXT:  .LCPI13_10:
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 15 @ 0xf
; CHECK-NEXT:    .long 18 @ 0x12
; CHECK-NEXT:    .long 21 @ 0x15
; CHECK-NEXT:  .LCPI13_11:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 3 @ 0x3
; CHECK-NEXT:    .long 6 @ 0x6
; CHECK-NEXT:    .long 9 @ 0x9

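; The IR below gathers three interleaved bytes per lane, data[3*i], data[3*i+1]
; and data[3*i+2], adds them together and stores 16 result bytes to dst per
; iteration; the scalarised ldrb/vmov.8 sequence in the CHECK lines above is
; what this currently lowers to.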
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %middle.block, %entry
  %n.vec = and i32 %n, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul nuw nsw <16 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i8, i8* %data, <16 x i32> %0
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %2 = add nuw nsw <16 x i32> %0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %3 = getelementptr inbounds i8, i8* %data, <16 x i32> %2
  %wide.masked.gather24 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %3, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %4 = add nuw nsw <16 x i32> %0, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %5 = getelementptr inbounds i8, i8* %data, <16 x i32> %4
  %wide.masked.gather25 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %5, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %6 = add nsw <16 x i8> %wide.masked.gather24, %wide.masked.gather
  %7 = add nsw <16 x i8> %6, %wide.masked.gather25
  %8 = getelementptr inbounds i8, i8* %dst, i32 %index
  %9 = bitcast i8* %8 to <16 x i8>*
  store <16 x i8> %7, <16 x i8>* %9, align 2
  %index.next = add i32 %index, 16
  %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void
}

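; As above, but with a simple unit-stride index: each lane loads data[vec.ind[i]]
; for 16 consecutive byte indices per iteration. The gather is still expanded to
; scalar ldrb loads and vmov.8 lane inserts in the checked output below.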
define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v16i8_simple:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #72
; CHECK-NEXT:    sub sp, #72
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    blt.w .LBB14_5
; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT:    adr r5, .LCPI14_3
; CHECK-NEXT:    adr r7, .LCPI14_1
; CHECK-NEXT:    vldrw.u32 q0, [r5]
; CHECK-NEXT:    adr r6, .LCPI14_2
; CHECK-NEXT:    adr r3, .LCPI14_0
; CHECK-NEXT:    bic r12, r2, #7
; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r7]
; CHECK-NEXT:    vmov.i32 q4, #0x10
; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r6]
; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:  .LBB14_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    mov lr, r1
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:  .LBB14_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB14_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vadd.i32 q1, q7, r0
; CHECK-NEXT:    vadd.i32 q2, q0, r0
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vadd.i32 q3, q5, r0
; CHECK-NEXT:    vmov r6, s12
; CHECK-NEXT:    subs r3, #16
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vadd.i32 q5, q5, q4
; CHECK-NEXT:    vadd.i32 q7, q7, q4
; CHECK-NEXT:    vadd.i32 q0, q0, q4
; CHECK-NEXT:    ldrb.w r8, [r4]
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb.w r10, [r4]
; CHECK-NEXT:    vmov r4, s9
; CHECK-NEXT:    ldrb.w r9, [r4]
; CHECK-NEXT:    vmov r4, s10
; CHECK-NEXT:    ldrb.w r11, [r4]
; CHECK-NEXT:    vmov r4, s11
; CHECK-NEXT:    vmov.8 q2[0], r6
; CHECK-NEXT:    vmov r6, s13
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    vmov.8 q2[1], r6
; CHECK-NEXT:    vmov r6, s14
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    vmov.8 q2[2], r6
; CHECK-NEXT:    vmov r6, s15
; CHECK-NEXT:    vadd.i32 q3, q6, r0
; CHECK-NEXT:    vadd.i32 q6, q6, q4
; CHECK-NEXT:    vmov r7, s12
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[3], r6
; CHECK-NEXT:    vmov r6, s5
; CHECK-NEXT:    vmov.8 q2[4], r7
; CHECK-NEXT:    vmov r7, s13
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[5], r7
; CHECK-NEXT:    vmov r7, s14
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[6], r7
; CHECK-NEXT:    vmov r7, s15
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[7], r7
; CHECK-NEXT:    vmov r7, s4
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q2[8], r7
; CHECK-NEXT:    vmov.8 q2[9], r6
; CHECK-NEXT:    vmov.8 q2[10], r8
; CHECK-NEXT:    vmov.8 q2[11], r10
; CHECK-NEXT:    vmov.8 q2[12], r5
; CHECK-NEXT:    vmov.8 q2[13], r9
; CHECK-NEXT:    vmov.8 q2[14], r11
; CHECK-NEXT:    vmov.8 q2[15], r4
; CHECK-NEXT:    vstrb.8 q2, [lr], #16
; CHECK-NEXT:    bne .LBB14_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB14_2 Depth=1
; CHECK-NEXT:    cmp r12, r2
; CHECK-NEXT:    bne .LBB14_2
; CHECK-NEXT:  .LBB14_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #72
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI14_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 3 @ 0x3
; CHECK-NEXT:  .LCPI14_1:
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 9 @ 0x9
; CHECK-NEXT:    .long 10 @ 0xa
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:  .LCPI14_2:
; CHECK-NEXT:    .long 4 @ 0x4
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 6 @ 0x6
; CHECK-NEXT:    .long 7 @ 0x7
; CHECK-NEXT:  .LCPI14_3:
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 13 @ 0xd
; CHECK-NEXT:    .long 14 @ 0xe
; CHECK-NEXT:    .long 15 @ 0xf

entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %middle.block, %entry
  %n.vec = and i32 %n, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %data, <16 x i32> %vec.ind
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %0, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %1 = getelementptr inbounds i8, i8* %dst, i32 %index
  %2 = bitcast i8* %1 to <16 x i8>*
  store <16 x i8> %wide.masked.gather, <16 x i8>* %2, align 2
  %index.next = add i32 %index, 16
  %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void
}

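; Masked gather/store intrinsic declarations.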
declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>)
declare <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>)
declare <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*>, i32, <16 x i1>, <16 x half>)
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)