; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
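; This file checks how masked gathers through a loaded vector of pointers are
; lowered for MVE: gathers of four 32-bit elements can use the vector-of-pointers
; form of VLDRW (vldrw.u32 q, [q]), while other widths and element sizes are
; expanded into scalar loads and lane inserts.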

; i32
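; Of the i32 cases below, only v4i32 matches the MVE gather instruction
; directly; v2, v8 and v16 are expanded into scalar ldr/ldrd loads plus
; vmov.32 lane inserts.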

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(<2 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    vmov.32 q0[0], r1
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4
  %gather = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32> %gather
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i32(<4 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32*>, <4 x i32*>* %offptr, align 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(<8 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldr.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldr.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldr r3, [r3]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
  %gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %gather
}

define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(<16 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov r6, s7
; CHECK-NEXT:    vmov r4, s11
; CHECK-NEXT:    ldr.w r12, [r1]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    ldr.w lr, [r1]
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    ldr r3, [r1]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov.32 q0[0], r5
; CHECK-NEXT:    vmov r5, s5
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov r5, s6
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov.32 q0[2], r5
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vmov.32 q1[1], r0
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.32 q0[3], r6
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov.32 q1[2], r0
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.32 q3[0], lr
; CHECK-NEXT:    vmov.32 q3[1], r3
; CHECK-NEXT:    vmov.32 q3[2], r1
; CHECK-NEXT:    vmov.32 q3[3], r2
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov.32 q1[3], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.32 q2[0], r5
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov.32 q2[1], r0
; CHECK-NEXT:    vmov.32 q2[2], r12
; CHECK-NEXT:    vmov.32 q2[3], r4
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
  %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
  ret <16 x i32> %gather
}

; f32
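; The f32 cases mirror the i32 ones: v4f32 uses the vldrw.u32 [q] gather,
; while v2f32 and v8f32 fall back to per-lane vldr loads.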

define arm_aapcs_vfpcc <2 x float> @ptr_v2f32(<2 x float*>* %offptr) {
; CHECK-LABEL: ptr_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    vldr s1, [r0]
; CHECK-NEXT:    vldr s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x float*>, <2 x float*>* %offptr, align 4
  %gather = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
  ret <2 x float> %gather
}

define arm_aapcs_vfpcc <4 x float> @ptr_v4f32(<4 x float*>* %offptr) {
; CHECK-LABEL: ptr_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x float*>, <4 x float*>* %offptr, align 4
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(<8 x float*>* %offptr) {
; CHECK-LABEL: ptr_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vldr s3, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vldr s2, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vldr s1, [r1]
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    vldr s7, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vldr s0, [r1]
; CHECK-NEXT:    vldr s6, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vldr s5, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vldr s4, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x float*>, <8 x float*>* %offptr, align 4
  %gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
  ret <8 x float> %gather
}

; i16
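; 16-bit gathers through a full vector of pointers have no single-instruction
; form here, so they are expanded to scalar ldrh/ldrsh loads; the extending
; variants finish with vmovlb.s16/vmovlb.u16 (or a vand mask for the v2i32
; zext case).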

define arm_aapcs_vfpcc <8 x i16> @ptr_i16(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], r12
; CHECK-NEXT:    vmov.16 q0[3], lr
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(<2 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v2i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    ldrsh.w r0, [r0]
; CHECK-NEXT:    ldrsh.w r1, [r1]
; CHECK-NEXT:    asrs r2, r0, #31
; CHECK-NEXT:    vmov.32 q0[0], r1
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov.32 q0[1], r1
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.32 q0[3], r2
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = sext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(<2 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v2i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    vmov.i64 q0, #0xffff
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.32 q1[0], r1
; CHECK-NEXT:    vmov.32 q1[2], r0
; CHECK-NEXT:    vand q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = zext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v4i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.32 q0[2], r3
; CHECK-NEXT:    vmov.32 q0[3], r1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v4i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.32 q0[2], r3
; CHECK-NEXT:    vmov.32 q0[3], r1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = sext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = zext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; f16
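; Half-precision gathers are expanded to per-lane vldr.16 loads, with each
; result moved through a GPR into its lane with vmov.16.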

define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) {
; CHECK-LABEL: ptr_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vldr.16 s0, [r1]
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vldr.16 s0, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.16 q0[1], r1
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vldr.16 s8, [r1]
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmov.16 q0[2], r1
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.16 q0[3], r1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vldr.16 s4, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q0[7], r0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x half*>, <8 x half*>* %offptr, align 4
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

; i8
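; 8-bit gathers are likewise scalarized to ldrb loads; sign and zero extension
; of the result use vmovlb.s8/vmovlb.s16 chains or a vand with 0xff.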

define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) {
; CHECK-LABEL: ptr_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r6, s11
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r1]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb.w lr, [r1]
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    ldrb r3, [r1]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov.8 q0[0], r5
; CHECK-NEXT:    vmov r5, s9
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q0[1], r5
; CHECK-NEXT:    vmov r5, s10
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q0[2], r5
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[3], r6
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[5], r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[7], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.8 q0[8], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[9], r0
; CHECK-NEXT:    vmov.8 q0[10], r12
; CHECK-NEXT:    vmov.8 q0[11], r4
; CHECK-NEXT:    vmov.8 q0[12], lr
; CHECK-NEXT:    vmov.8 q0[13], r3
; CHECK-NEXT:    vmov.8 q0[14], r1
; CHECK-NEXT:    vmov.8 q0[15], r2
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4
  %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  ret <16 x i8> %gather
}

define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_sext16(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_sext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    vmov r5, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r2]
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    ldrb.w lr, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], lr
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], r3
; CHECK-NEXT:    vmov.16 q0[3], r12
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r1
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_zext16(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_zext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    vmov r5, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r2]
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    ldrb.w lr, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], lr
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], r3
; CHECK-NEXT:    vmov.16 q0[3], r12
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r1
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.32 q0[1], r3
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    vmov.32 q0[2], r1
; CHECK-NEXT:    vmov.32 q0[3], r2
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = sext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.32 q0[1], r1
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.32 q0[2], r3
; CHECK-NEXT:    vmov.32 q0[3], r0
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = zext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrb.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    ldrb.w r12, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb.w lr, [r2]
; CHECK-NEXT:    vmov r2, s7
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r4
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    vmov.32 q2[2], r1
; CHECK-NEXT:    vmov.32 q2[3], r2
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; loops
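; In a vectorized loop the gather should combine with the mask: the compare
; against null produces a VPT block (vptt.i32 ne, q0, zr) under which both the
; gather (vldrwt.u32 [q0]) and the masked store execute, inside a dls/le
; hardware loop.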

define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_int32_t:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB22_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB22_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB22_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %0 = getelementptr inbounds i32*, i32** %src, i32 %index
  %1 = bitcast i32** %0 to <4 x i32*>*
  %wide.load = load <4 x i32*>, <4 x i32*>* %1, align 4
  %2 = icmp ne <4 x i32*> %wide.load, zeroinitializer
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %wide.load, i32 4, <4 x i1> %2, <4 x i32> undef)
  %3 = getelementptr inbounds i32, i32* %dest, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.gather, <4 x i32>* %4, i32 4, <4 x i1> %2)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %and
  br i1 %5, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_float:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB23_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB23_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB23_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %0 = getelementptr inbounds float*, float** %src, i32 %index
  %1 = bitcast float** %0 to <4 x float*>*
  %wide.load = load <4 x float*>, <4 x float*>* %1, align 4
  %2 = icmp ne <4 x float*> %wide.load, zeroinitializer
  %3 = bitcast <4 x float*> %wide.load to <4 x i32*>
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> %2, <4 x i32> undef)
  %4 = getelementptr inbounds float, float* %dest, i32 %index
  %5 = bitcast float* %4 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.gather, <4 x i32>* %5, i32 4, <4 x i1> %2)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %and
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

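; A gather through pointers formed by a vector GEP: the constant offset
; (4 x i32, i.e. 16 bytes) is materialized as a splat, added to the pointer
; vector, and the result fed straight into the vldrw.u32 [q] gather.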
define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) {
; CHECK-LABEL: qi4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0x10
; CHECK-NEXT:    vadd.i32 q1, q0, q1
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

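; Here the pointer vector is built in-register: byte offsets are loaded and
; zero-extended with vldrb.u32 and added to the scalar base with vadd.i32.
; The byte gather itself is still scalarized, ending in sign extension to i32
; via vmovlb.s8/vmovlb.s16.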
define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrb.u32 q0, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrb.w lr, [r3]
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q1[2], r3
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.sext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %gather.sext
}

declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>)
declare <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>)
declare <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*>, i32, <16 x i1>, <16 x half>)
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)