; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; VSTRB.8
; A <16 x i8> payload scattered through zero-extended i8 offsets from a scalar
; base. The expected output is one vector load of the offsets followed by a
; single byte scatter store using a vector offset.
define arm_aapcs_vfpcc void @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vstrb.8 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
; An <8 x i8> payload with zero-extended i8 offsets. The expected output does
; not use a vector scatter: the offsets are widened to i32 in two halves and
; added to the base, then every lane is written with its own strb.
define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
; CHECK-LABEL: unscaled_v8i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q2, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %input, <8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
; A <2 x i8> payload with zero-extended i8 offsets. The expected output loads
; the two offsets with scalar ldrb, masks them to 8 bits, and writes each lane
; with a register-offset strb.
define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x i8> %input) {
; CHECK-LABEL: unscaled_v2i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r2, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    ldrb r1, [r1, #1]
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov.32 q2[2], r1
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strb r2, [r0, r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    strb r2, [r0, r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i8>, <2 x i8>* %offptr, align 1
  %offs.zext = zext <2 x i8> %offs to <2 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext
  call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %input, <2 x i8*> %ptrs, i32 1, <2 x i1> <i1 true, i1 true>)
  ret void
}

; Expand
; Same payload as unscaled_v16i8_i8, but the i8 offsets are sign-extended.
; The expected output builds the sixteen addresses with widening signed loads
; plus vadd of the base, then stores lane by lane with strb.
; NOTE(review): presumably the byte scatter treats its vector offsets as
; unsigned, which would rule it out here - confirm against the MVE reference.
define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.s32 q4, [r1]
; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.s32 q3, [r1, #4]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.sext = sext <16 x i8> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
; A <16 x i8> payload whose offsets are i16 values sign-extended to i32. The
; expected output widens the offsets with vldrh.s32, adds the base, and
; stores each lane individually with strb.
define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrh.s32 q4, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
; CHECK-NEXT:    vldrh.s32 q2, [r1, #16]
; CHECK-NEXT:    vldrh.s32 q3, [r1, #8]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i16>, <16 x i16>* %offptr, align 2
  %offs.sext = sext <16 x i16> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
; The pointers are formed by indexing an i32* base (so each zero-extended i8
; offset is scaled by 4) and then bitcast to i8*. The expected output applies
; the scale with vshl #2, adds the base, and stores every lane with strb.
define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_scaled:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.u32 q4, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
; CHECK-NEXT:    vldrb.u32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.u32 q3, [r1, #4]
; CHECK-NEXT:    vshl.i32 q4, q4, #2
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vshl.i32 q2, q2, #2
; CHECK-NEXT:    vshl.i32 q3, q3, #2
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 4
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext
  %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
; The offsets are loaded as full i32 values with no extension step. The
; expected output loads them with vldrw.u32, adds the base, and stores each
; of the sixteen lanes with strb.
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_next:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q4, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r1, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r1, #16]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i32>, <16 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Truncating scatter: a <16 x i64> input is truncated to <16 x i8> and stored
; through zero-extended i8 offsets. In the expected output only the truncate
; is done lane by lane (part of the input is read from the stack); the store
; itself is a single vstrb.8 vector scatter, not an expanded sequence.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    add r3, sp, #40
; CHECK-NEXT:    vmov.8 q5[0], r4
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    vmov.8 q5[1], r4
; CHECK-NEXT:    vmov r4, s4
; CHECK-NEXT:    vmov.8 q5[2], r4
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vmov.8 q5[3], r4
; CHECK-NEXT:    vmov r4, s8
; CHECK-NEXT:    vmov.8 q5[4], r4
; CHECK-NEXT:    vmov r4, s10
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vmov.8 q5[5], r4
; CHECK-NEXT:    vmov r4, s12
; CHECK-NEXT:    add.w lr, sp, #56
; CHECK-NEXT:    vmov.8 q5[6], r4
; CHECK-NEXT:    vmov r4, s14
; CHECK-NEXT:    vmov.8 q5[7], r4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q5[8], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vldrw.u32 q0, [lr]
; CHECK-NEXT:    vmov.8 q5[9], r3
; CHECK-NEXT:    add.w r12, sp, #72
; CHECK-NEXT:    add r2, sp, #88
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vldrw.u32 q4, [r2]
; CHECK-NEXT:    vmov.8 q5[10], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vldrw.u32 q0, [r12]
; CHECK-NEXT:    vmov.8 q5[11], r3
; CHECK-NEXT:    vmov r2, s18
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q5[12], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.8 q5[13], r3
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    vmov.8 q5[14], r3
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vmov.8 q5[15], r2
; CHECK-NEXT:    vstrb.8 q5, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i64> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Truncating scatter: a <16 x i32> input is truncated to <16 x i8> and stored
; through zero-extended i8 offsets. In the expected output the truncated
; vector is assembled one lane at a time with vmov.8, and the store is a
; single vstrb.8 vector scatter rather than an expanded sequence.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q4[0], r3
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmov.8 q4[1], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.8 q4[2], r3
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov.8 q4[3], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.8 q4[4], r3
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    vmov.8 q4[5], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov.8 q4[6], r3
; CHECK-NEXT:    vmov r3, s7
; CHECK-NEXT:    vmov.8 q4[7], r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov.8 q4[8], r3
; CHECK-NEXT:    vmov r3, s9
; CHECK-NEXT:    vmov.8 q4[9], r3
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov.8 q4[10], r3
; CHECK-NEXT:    vmov r3, s11
; CHECK-NEXT:    vmov.8 q4[11], r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    vmov.8 q4[12], r3
; CHECK-NEXT:    vmov r3, s13
; CHECK-NEXT:    vmov.8 q4[13], r3
; CHECK-NEXT:    vmov r3, s14
; CHECK-NEXT:    vmov r2, s15
; CHECK-NEXT:    vmov.8 q4[14], r3
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vmov.8 q4[15], r2
; CHECK-NEXT:    vstrb.8 q4, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i32> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Truncating scatter: a <16 x i16> input is truncated to <16 x i8> and stored
; through zero-extended i8 offsets. In the expected output the truncated
; vector is assembled one lane at a time with vmov.8, and the store is a
; single vstrb.8 vector scatter rather than an expanded sequence.
; NOTE(review): the scatter's alignment operand is 4 here, whereas every other
; test in this file passes 1 - confirm that this is intentional.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    vmov.u16 r2, q1[7]
; CHECK-NEXT:    vmov.8 q2[0], r3
; CHECK-NEXT:    vmov.u16 r3, q0[1]
; CHECK-NEXT:    vmov.8 q2[1], r3
; CHECK-NEXT:    vmov.u16 r3, q0[2]
; CHECK-NEXT:    vmov.8 q2[2], r3
; CHECK-NEXT:    vmov.u16 r3, q0[3]
; CHECK-NEXT:    vmov.8 q2[3], r3
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    vmov.8 q2[4], r3
; CHECK-NEXT:    vmov.u16 r3, q0[5]
; CHECK-NEXT:    vmov.8 q2[5], r3
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    vmov.8 q2[6], r3
; CHECK-NEXT:    vmov.u16 r3, q0[7]
; CHECK-NEXT:    vmov.8 q2[7], r3
; CHECK-NEXT:    vmov.u16 r3, q1[0]
; CHECK-NEXT:    vmov.8 q2[8], r3
; CHECK-NEXT:    vmov.u16 r3, q1[1]
; CHECK-NEXT:    vmov.8 q2[9], r3
; CHECK-NEXT:    vmov.u16 r3, q1[2]
; CHECK-NEXT:    vmov.8 q2[10], r3
; CHECK-NEXT:    vmov.u16 r3, q1[3]
; CHECK-NEXT:    vmov.8 q2[11], r3
; CHECK-NEXT:    vmov.u16 r3, q1[4]
; CHECK-NEXT:    vmov.8 q2[12], r3
; CHECK-NEXT:    vmov.u16 r3, q1[5]
; CHECK-NEXT:    vmov.8 q2[13], r3
; CHECK-NEXT:    vmov.u16 r3, q1[6]
; CHECK-NEXT:    vmov.8 q2[14], r3
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vmov.8 q2[15], r2
; CHECK-NEXT:    vstrb.8 q2, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i16> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Two chained GEPs: loaded i8 offsets index the base directly (no explicit
; extension in the IR), then a constant 5 is added to every pointer. The
; expected output widens the offsets with vldrb.s32, adds the base and a
; splat of 5, and stores each lane with strb (no vector scatter).
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.s32 q3, [r1, #4]
; CHECK-NEXT:    vldrb.s32 q5, [r1]
; CHECK-NEXT:    vmov.i32 q4, #0x5
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vadd.i32 q5, q5, r0
; CHECK-NEXT:    vadd.i32 q1, q1, q4
; CHECK-NEXT:    vadd.i32 q2, q2, q4
; CHECK-NEXT:    vadd.i32 q3, q3, q4
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
  %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Two chained GEPs with constant operands: offsets 0, 3, ..., 45 followed by
; a constant +5. The expected output folds both into one constant-pool vector
; (5, 8, ..., 50) and uses it as the offset of a single vstrb.8 scatter.
; %offptr is not used by this function.
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI11_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrb.8 q0, [r0, q1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI11_0:
; CHECK-NEXT:    .byte 5 @ 0x5
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 11 @ 0xb
; CHECK-NEXT:    .byte 14 @ 0xe
; CHECK-NEXT:    .byte 17 @ 0x11
; CHECK-NEXT:    .byte 20 @ 0x14
; CHECK-NEXT:    .byte 23 @ 0x17
; CHECK-NEXT:    .byte 26 @ 0x1a
; CHECK-NEXT:    .byte 29 @ 0x1d
; CHECK-NEXT:    .byte 32 @ 0x20
; CHECK-NEXT:    .byte 35 @ 0x23
; CHECK-NEXT:    .byte 38 @ 0x26
; CHECK-NEXT:    .byte 41 @ 0x29
; CHECK-NEXT:    .byte 44 @ 0x2c
; CHECK-NEXT:    .byte 47 @ 0x2f
; CHECK-NEXT:    .byte 50 @ 0x32
entry:
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45>
  %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}


; Masked scatter intrinsics used by the tests in this file, one per payload
; width. Operands: value vector, pointer vector, alignment, lane mask.
declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
