; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; VSTRB.16 Qd, [base, offs]
; Truncating scatter of i16 lanes to bytes, with zext'd i16 byte offsets.
define arm_aapcs_vfpcc void @ext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: ext_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %t = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %t, <8 x i8*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRB.16 Qd, [base, zext(offs)]
; Truncating scatter of i16 lanes to bytes, with zext'd i8 byte offsets.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, offs]
; i16 scatter with unscaled (byte) i16 offsets, zero-extended.
define arm_aapcs_vfpcc void @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, offs]
; f16 scatter with unscaled (byte) i16 offsets, zero-extended.
define arm_aapcs_vfpcc void @unscaled_v8f16_i16(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand: sign-extended offsets have no VSTRH gather/scatter form, so the
; scatter is expanded into per-lane stores.
define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q2, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand: sign-extended offsets force per-lane f16 stores.
define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q2, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vmovx.f16 s12, s0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    vstr.16 s12, [r1]
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vmov r1, s11
; CHECK-NEXT:    vmovx.f16 s8, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vstr.16 s8, [r1]
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmovx.f16 s8, s2
; CHECK-NEXT:    vstr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand: i32 offsets (no extension) do not match the VSTRH.16 offset form,
; so the scatter is expanded into per-lane stores.
define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand: i32 offsets (no extension) force per-lane f16 stores.
define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmovx.f16 s12, s0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    vstr.16 s12, [r1]
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vmov r1, s11
; CHECK-NEXT:    vmovx.f16 s8, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vstr.16 s8, [r1]
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmovx.f16 s8, s2
; CHECK-NEXT:    vstr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, zext(offs)]
; i16 scatter with zext'd i8 byte offsets.
define arm_aapcs_vfpcc void @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, zext(offs)]
; f16 scatter with zext'd i8 byte offsets.
define arm_aapcs_vfpcc void @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unsigned_unscaled_f16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrb.s32 q5, [r1]
; CHECK-NEXT:    vldrb.s32 q4, [r1, #4]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vadd.i32 q5, q5, r0
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s21
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s22
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s23
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov r1, s12
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.16 q4[0], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.16 q4[1], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.16 q4[2], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov.16 q4[3], r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov.16 q4[4], r3
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov.16 q4[5], r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.16 q4[6], r3
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vmov.16 q4[7], r2
; CHECK-NEXT:    vstrh.16 q4, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q3, [r1]
; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.16 q2[0], r3
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmov.16 q2[1], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.16 q2[2], r3
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov.16 q2[3], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.16 q2[4], r3
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    vmov.16 q2[5], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov r2, s7
; CHECK-NEXT:    vmov.16 q2[6], r3
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vmov.16 q2[7], r2
; CHECK-NEXT:    vstrh.16 q2, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand ?
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x half*>, i32, <8 x i1>)