• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
3
; Sums the lane-wise product %x * %y over four i32 lanes, with lanes where
; %b != 0 replaced by zero before the add reduction.
; The checks expect one VPT block containing a predicated VMLAV.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <4 x i32> %b, zeroinitializer
  %prod = mul <4 x i32> %x, %y
  %masked = select <4 x i1> %pred, <4 x i32> %prod, <4 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked)
  ret i32 %sum
}
17
; Zero-extends both <4 x i32> operands to i64, multiplies, zeroes the lanes
; where %b != 0 and add-reduces to a single i64.
; The checks expect a predicated unsigned long accumulate (VMLALV.u32).
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <4 x i32> %b, zeroinitializer
  %x.wide = zext <4 x i32> %x to <4 x i64>
  %y.wide = zext <4 x i32> %y to <4 x i64>
  %prod = mul <4 x i64> %x.wide, %y.wide
  %masked = select <4 x i1> %pred, <4 x i64> %prod, <4 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %masked)
  ret i64 %sum
}
33
; Sign-extends both <4 x i32> operands to i64, multiplies, zeroes the lanes
; where %b != 0 and add-reduces to a single i64.
; The checks expect a predicated signed long accumulate (VMLALV.s32).
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <4 x i32> %b, zeroinitializer
  %x.wide = sext <4 x i32> %x to <4 x i64>
  %y.wide = sext <4 x i32> %y to <4 x i64>
  %prod = mul <4 x i64> %x.wide, %y.wide
  %masked = select <4 x i1> %pred, <4 x i64> %prod, <4 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %masked)
  ret i64 %sum
}
49
; Two-lane case: zero-extended <2 x i32> operands are multiplied as i64,
; lanes where %b != 0 are zeroed, and the two i64 lanes are summed.
; The checks do not expect a predicated VMLALV here: they expect vmullb.u32,
; a per-lane scalar compare building an all-ones/all-zeros mask, a vand with
; that mask, and a scalar adds/adcs pair for the final 64-bit add.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmullb.u32 q3, q0, q1
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.32 q0[3], r0
; CHECK-NEXT:    vand q0, q3, q0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <2 x i32> %b, zeroinitializer
  %x.wide = zext <2 x i32> %x to <2 x i64>
  %y.wide = zext <2 x i32> %y to <2 x i64>
  %prod = mul <2 x i64> %x.wide, %y.wide
  %masked = select <2 x i1> %pred, <2 x i64> %prod, <2 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %masked)
  ret i64 %sum
}
85
; Two-lane case: sign-extended <2 x i32> operands are multiplied as i64,
; lanes where %b != 0 are zeroed, and the two i64 lanes are summed.
; The checks do not expect a predicated VMLALV here: they expect vmullb.s32,
; a per-lane scalar compare building an all-ones/all-zeros mask, a vand with
; that mask, and a scalar adds/adcs pair for the final 64-bit add.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmullb.s32 q3, q0, q1
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.32 q0[3], r0
; CHECK-NEXT:    vand q0, q3, q0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <2 x i32> %b, zeroinitializer
  %x.wide = sext <2 x i32> %x to <2 x i64>
  %y.wide = sext <2 x i32> %y to <2 x i64>
  %prod = mul <2 x i64> %x.wide, %y.wide
  %masked = select <2 x i1> %pred, <2 x i64> %prod, <2 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %masked)
  ret i64 %sum
}
121
; Zero-extends both <8 x i16> operands to i32, multiplies, zeroes the lanes
; where %b != 0 and add-reduces to an i32.
; The checks expect a 16-bit VPT compare and a predicated VMLAV.u16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i16> %b, zeroinitializer
  %x.wide = zext <8 x i16> %x to <8 x i32>
  %y.wide = zext <8 x i16> %y to <8 x i32>
  %prod = mul <8 x i32> %x.wide, %y.wide
  %masked = select <8 x i1> %pred, <8 x i32> %prod, <8 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %masked)
  ret i32 %sum
}
137
; Sign-extends both <8 x i16> operands to i32, multiplies, zeroes the lanes
; where %b != 0 and add-reduces to an i32.
; The checks expect a 16-bit VPT compare and a predicated VMLAV.s16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i16> %b, zeroinitializer
  %x.wide = sext <8 x i16> %x to <8 x i32>
  %y.wide = sext <8 x i16> %y to <8 x i32>
  %prod = mul <8 x i32> %x.wide, %y.wide
  %masked = select <8 x i1> %pred, <8 x i32> %prod, <8 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %masked)
  ret i32 %sum
}
153
; <4 x i16> operands zero-extended to i32, multiplied, masked by %b == 0 and
; add-reduced to an i32.
; The checks expect explicit vmovlb.u16 widening of both operands and of the
; predicate source, followed by a 32-bit VPT compare and predicated VMLAV.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vmovlb.u16 q2, q2
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <4 x i16> %b, zeroinitializer
  %x.wide = zext <4 x i16> %x to <4 x i32>
  %y.wide = zext <4 x i16> %y to <4 x i32>
  %prod = mul <4 x i32> %x.wide, %y.wide
  %masked = select <4 x i1> %pred, <4 x i32> %prod, <4 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked)
  ret i32 %sum
}
172
; <4 x i16> operands sign-extended to i32, multiplied, masked by %b == 0 and
; add-reduced to an i32.
; The checks expect vmovlb.s16 on both operands, vmovlb.u16 on the predicate
; source, then a 32-bit VPT compare and a predicated VMLAV (the .u32 form).
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmovlb.u16 q2, q2
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <4 x i16> %b, zeroinitializer
  %x.wide = sext <4 x i16> %x to <4 x i32>
  %y.wide = sext <4 x i16> %y to <4 x i32>
  %prod = mul <4 x i32> %x.wide, %y.wide
  %masked = select <4 x i1> %pred, <4 x i32> %prod, <4 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked)
  ret i32 %sum
}
191
; No widening: the <8 x i16> product is masked by %b == 0 and add-reduced to
; an i16 that is returned zeroext.
; The checks expect a predicated VMLAV.u16 followed by uxth on the result.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i16> %b, zeroinitializer
  %prod = mul <8 x i16> %x, %y
  %masked = select <8 x i1> %pred, <8 x i16> %prod, <8 x i16> zeroinitializer
  %sum = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %masked)
  ret i16 %sum
}
206
; Zero-extends both <8 x i16> operands directly to i64, multiplies, zeroes
; the lanes where %b != 0 and add-reduces to an i64.
; The checks expect a predicated unsigned long accumulate (VMLALV.u16).
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i16> %b, zeroinitializer
  %x.wide = zext <8 x i16> %x to <8 x i64>
  %y.wide = zext <8 x i16> %y to <8 x i64>
  %prod = mul <8 x i64> %x.wide, %y.wide
  %masked = select <8 x i1> %pred, <8 x i64> %prod, <8 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %masked)
  ret i64 %sum
}
222
; Sign-extends both <8 x i16> operands directly to i64, multiplies, zeroes
; the lanes where %b != 0 and add-reduces to an i64.
; The checks expect a predicated signed long accumulate (VMLALV.s16).
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i16> %b, zeroinitializer
  %x.wide = sext <8 x i16> %x to <8 x i64>
  %y.wide = sext <8 x i16> %y to <8 x i64>
  %prod = mul <8 x i64> %x.wide, %y.wide
  %masked = select <8 x i1> %pred, <8 x i64> %prod, <8 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %masked)
  ret i64 %sum
}
238
; Two-step widening: operands are zero-extended i16 -> i32 and multiplied at
; i32, then the product is zero-extended to i64 before the masked reduction.
; The checks expect this to still select a single predicated VMLALV.u16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i16> %b, zeroinitializer
  %x.wide = zext <8 x i16> %x to <8 x i32>
  %y.wide = zext <8 x i16> %y to <8 x i32>
  %prod = mul <8 x i32> %x.wide, %y.wide
  %prod.wide = zext <8 x i32> %prod to <8 x i64>
  %masked = select <8 x i1> %pred, <8 x i64> %prod.wide, <8 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %masked)
  ret i64 %sum
}
255
; Two-step widening: operands are sign-extended i16 -> i32 and multiplied at
; i32, then the product is sign-extended to i64 before the masked reduction.
; The checks expect this to still select a single predicated VMLALV.s16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i16> %b, zeroinitializer
  %x.wide = sext <8 x i16> %x to <8 x i32>
  %y.wide = sext <8 x i16> %y to <8 x i32>
  %prod = mul <8 x i32> %x.wide, %y.wide
  %prod.wide = sext <8 x i32> %prod to <8 x i64>
  %masked = select <8 x i1> %pred, <8 x i64> %prod.wide, <8 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %masked)
  ret i64 %sum
}
272
; Mixed extension: %x is sign-extended to i32 and squared (%y is not used),
; then the square is zero-extended to i64 before the masked reduction.
; The checks expect the signed predicated VMLALV.s16 with q0 as both
; multiplicands.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i16> %b, zeroinitializer
  %x.wide = sext <8 x i16> %x to <8 x i32>
  %square = mul <8 x i32> %x.wide, %x.wide
  %square.wide = zext <8 x i32> %square to <8 x i64>
  %masked = select <8 x i1> %pred, <8 x i64> %square.wide, <8 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %masked)
  ret i64 %sum
}
288
; Two-lane case with <2 x i16> operands zero-extended to i64, multiplied,
; masked by %b == 0 and summed.
; The checks expect an expanded sequence rather than a VMLALV: d8/d9 are
; saved, operands and the predicate source are masked with 0xffff, each lane
; is multiplied with umull, a per-lane all-ones/all-zeros mask is built from
; scalar compares, and the final 64-bit add is done with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov.i64 q3, #0xffff
; CHECK-NEXT:    vand q1, q1, q3
; CHECK-NEXT:    vand q4, q0, q3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s16
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.32 q0[1], r1
; CHECK-NEXT:    vmov r1, s18
; CHECK-NEXT:    vand q1, q2, q3
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.32 q0[3], r1
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    vmov.32 q2[1], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.32 q2[3], r0
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <2 x i16> %b, zeroinitializer
  %x.wide = zext <2 x i16> %x to <2 x i64>
  %y.wide = zext <2 x i16> %y to <2 x i64>
  %prod = mul <2 x i64> %x.wide, %y.wide
  %masked = select <2 x i1> %pred, <2 x i64> %prod, <2 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %masked)
  ret i64 %sum
}
340
; Two-lane case with <2 x i16> operands sign-extended to i64, multiplied,
; masked by %b == 0 and summed.
; The checks expect an expanded sequence rather than a VMLALV: the predicate
; source is masked with 0xffff and compared per lane in scalar code, each
; operand lane is sign-extended with sxth and multiplied with smull, the
; product is ANDed with the lane mask, and the final add uses adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q3, #0xffff
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vand q3, q2, q3
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    sxth r1, r1
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    vmov.32 q2[1], r0
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.32 q2[3], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.32 q3[1], r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    sxth r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.32 q3[3], r1
; CHECK-NEXT:    vand q0, q3, q2
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <2 x i16> %b, zeroinitializer
  %x.wide = sext <2 x i16> %x to <2 x i64>
  %y.wide = sext <2 x i16> %y to <2 x i64>
  %prod = mul <2 x i64> %x.wide, %y.wide
  %masked = select <2 x i1> %pred, <2 x i64> %prod, <2 x i64> zeroinitializer
  %sum = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %masked)
  ret i64 %sum
}
391
; Zero-extends both <16 x i8> operands to i32, multiplies, zeroes the lanes
; where %b != 0 and add-reduces to an i32.
; The checks expect an 8-bit VPT compare and a predicated VMLAV.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <16 x i8> %b, zeroinitializer
  %x.wide = zext <16 x i8> %x to <16 x i32>
  %y.wide = zext <16 x i8> %y to <16 x i32>
  %prod = mul <16 x i32> %x.wide, %y.wide
  %masked = select <16 x i1> %pred, <16 x i32> %prod, <16 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %masked)
  ret i32 %sum
}
407
; Sign-extends both <16 x i8> operands to i32, multiplies, zeroes the lanes
; where %b != 0 and add-reduces to an i32.
; The checks expect an 8-bit VPT compare and a predicated VMLAV.s8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <16 x i8> %b, zeroinitializer
  %x.wide = sext <16 x i8> %x to <16 x i32>
  %y.wide = sext <16 x i8> %y to <16 x i32>
  %prod = mul <16 x i32> %x.wide, %y.wide
  %masked = select <16 x i1> %pred, <16 x i32> %prod, <16 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %masked)
  ret i32 %sum
}
423
; Two-step widening: operands are zero-extended i8 -> i16 and multiplied at
; i16, then the product is zero-extended to i32 before the masked reduction.
; The checks expect this to still select a single predicated VMLAV.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <16 x i8> %b, zeroinitializer
  %x.wide = zext <16 x i8> %x to <16 x i16>
  %y.wide = zext <16 x i8> %y to <16 x i16>
  %prod = mul <16 x i16> %x.wide, %y.wide
  %prod.wide = zext <16 x i16> %prod to <16 x i32>
  %masked = select <16 x i1> %pred, <16 x i32> %prod.wide, <16 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %masked)
  ret i32 %sum
}
440
; Two-step widening: operands are sign-extended i8 -> i16 and multiplied at
; i16, then the product is sign-extended to i32 before the masked reduction.
; The checks expect this to still select a single predicated VMLAV.s8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <16 x i8> %b, zeroinitializer
  %x.wide = sext <16 x i8> %x to <16 x i16>
  %y.wide = sext <16 x i8> %y to <16 x i16>
  %prod = mul <16 x i16> %x.wide, %y.wide
  %prod.wide = sext <16 x i16> %prod to <16 x i32>
  %masked = select <16 x i1> %pred, <16 x i32> %prod.wide, <16 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %masked)
  ret i32 %sum
}
457
; Mixed extension: %x is sign-extended to i16 and squared (%y is not used),
; then the square is zero-extended to i32 before the masked reduction.
; The checks expect the signed predicated VMLAV.s8 with q0 as both
; multiplicands.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.s8 r0, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <16 x i8> %b, zeroinitializer
  %x.wide = sext <16 x i8> %x to <16 x i16>
  %square = mul <16 x i16> %x.wide, %x.wide
  %square.wide = zext <16 x i16> %square to <16 x i32>
  %masked = select <16 x i1> %pred, <16 x i32> %square.wide, <16 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %masked)
  ret i32 %sum
}
473
; <4 x i8> operands zero-extended to i32, multiplied, masked by %b == 0 and
; add-reduced to an i32.
; The checks expect both operands and the predicate source to be masked with
; 0xff (vand), followed by a 32-bit VPT compare and predicated VMLAV.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q3, #0xff
; CHECK-NEXT:    vand q1, q1, q3
; CHECK-NEXT:    vand q0, q0, q3
; CHECK-NEXT:    vand q2, q2, q3
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <4 x i8> %b, zeroinitializer
  %x.wide = zext <4 x i8> %x to <4 x i32>
  %y.wide = zext <4 x i8> %y to <4 x i32>
  %prod = mul <4 x i32> %x.wide, %y.wide
  %masked = select <4 x i1> %pred, <4 x i32> %prod, <4 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked)
  ret i32 %sum
}
493
; <4 x i8> operands sign-extended to i32, multiplied, masked by %b == 0 and
; add-reduced to an i32.
; The checks expect each operand to be widened with vmovlb.s8 then
; vmovlb.s16, the predicate source to be masked with 0xff, and then a 32-bit
; VPT compare with a predicated VMLAV (the .u32 form).
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmov.i32 q3, #0xff
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vand q2, q2, q3
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <4 x i8> %b, zeroinitializer
  %x.wide = sext <4 x i8> %x to <4 x i32>
  %y.wide = sext <4 x i8> %y to <4 x i32>
  %prod = mul <4 x i32> %x.wide, %y.wide
  %masked = select <4 x i1> %pred, <4 x i32> %prod, <4 x i32> zeroinitializer
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked)
  ret i32 %sum
}
515
; Zero-extends both <16 x i8> operands to i16, multiplies, masks by %b == 0
; and add-reduces to an i16 that is returned zeroext.
; The checks expect a predicated VMLAV.u8 followed by uxth on the result.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <16 x i8> %b, zeroinitializer
  %x.wide = zext <16 x i8> %x to <16 x i16>
  %y.wide = zext <16 x i8> %y to <16 x i16>
  %prod = mul <16 x i16> %x.wide, %y.wide
  %masked = select <16 x i1> %pred, <16 x i16> %prod, <16 x i16> zeroinitializer
  %sum = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %masked)
  ret i16 %sum
}
532
; Sign-extends both <16 x i8> operands to i16, multiplies, masks by %b == 0
; and add-reduces to an i16 that is returned signext.
; The checks expect a predicated VMLAV.s8 followed by sxth on the result.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <16 x i8> %b, zeroinitializer
  %x.wide = sext <16 x i8> %x to <16 x i16>
  %y.wide = sext <16 x i8> %y to <16 x i16>
  %prod = mul <16 x i16> %x.wide, %y.wide
  %masked = select <16 x i1> %pred, <16 x i16> %prod, <16 x i16> zeroinitializer
  %sum = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %masked)
  ret i16 %sum
}
549
; <8 x i8> operands zero-extended to i16, multiplied, masked by %b == 0 and
; add-reduced to an i16 that is returned zeroext.
; The checks expect vmovlb.u8 widening of both operands and the predicate
; source, a 16-bit VPT compare, a predicated VMLAV.u16 and a final uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i8> %b, zeroinitializer
  %x.wide = zext <8 x i8> %x to <8 x i16>
  %y.wide = zext <8 x i8> %y to <8 x i16>
  %prod = mul <8 x i16> %x.wide, %y.wide
  %masked = select <8 x i1> %pred, <8 x i16> %prod, <8 x i16> zeroinitializer
  %sum = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %masked)
  ret i16 %sum
}
569
; <8 x i8> operands sign-extended to i16, multiplied, masked by %b == 0 and
; add-reduced to an i16 that is returned signext.
; The checks expect vmovlb.s8 on both operands, vmovlb.u8 on the predicate
; source, a 16-bit VPT compare, a predicated VMLAV (the .u16 form) and a
; final sxth.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <8 x i8> %b, zeroinitializer
  %x.wide = sext <8 x i8> %x to <8 x i16>
  %y.wide = sext <8 x i8> %y to <8 x i16>
  %prod = mul <8 x i16> %x.wide, %y.wide
  %masked = select <8 x i1> %pred, <8 x i16> %prod, <8 x i16> zeroinitializer
  %sum = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %masked)
  ret i16 %sum
}
589
; No widening: the <16 x i8> product is masked by %b == 0 and add-reduced to
; an i8 that is returned zeroext.
; The checks expect a predicated VMLAV.u8 followed by uxtb on the result.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %pred = icmp eq <16 x i8> %b, zeroinitializer
  %prod = mul <16 x i8> %x, %y
  %masked = select <16 x i1> %pred, <16 x i8> %prod, <16 x i8> zeroinitializer
  %sum = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %masked)
  ret i8 %sum
}
604
605define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
606; CHECK-LABEL: add_v16i8_v16i64_zext:
607; CHECK:       @ %bb.0: @ %entry
608; CHECK-NEXT:    .save {r7, lr}
609; CHECK-NEXT:    push {r7, lr}
610; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
611; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
612; CHECK-NEXT:    .pad #88
613; CHECK-NEXT:    sub sp, #88
614; CHECK-NEXT:    vmov q3, q1
615; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
616; CHECK-NEXT:    vmov.i8 q0, #0x0
617; CHECK-NEXT:    vmov.i8 q1, #0xff
618; CHECK-NEXT:    vcmp.i8 eq, q2, zr
619; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
620; CHECK-NEXT:    vpsel q5, q1, q0
621; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
622; CHECK-NEXT:    vmov.u8 r0, q5[0]
623; CHECK-NEXT:    vmov.i64 q4, #0xff
624; CHECK-NEXT:    vmov.16 q2[0], r0
625; CHECK-NEXT:    vmov.u8 r0, q5[1]
626; CHECK-NEXT:    vmov.16 q2[1], r0
627; CHECK-NEXT:    vmov.u8 r0, q5[2]
628; CHECK-NEXT:    vmov.16 q2[2], r0
629; CHECK-NEXT:    vmov.u8 r0, q5[3]
630; CHECK-NEXT:    vmov.16 q2[3], r0
631; CHECK-NEXT:    vmov.u8 r0, q5[4]
632; CHECK-NEXT:    vmov.16 q2[4], r0
633; CHECK-NEXT:    vmov.u8 r0, q5[5]
634; CHECK-NEXT:    vmov.16 q2[5], r0
635; CHECK-NEXT:    vmov.u8 r0, q5[6]
636; CHECK-NEXT:    vmov.16 q2[6], r0
637; CHECK-NEXT:    vmov.u8 r0, q5[7]
638; CHECK-NEXT:    vmov.16 q2[7], r0
639; CHECK-NEXT:    vcmp.i16 ne, q2, zr
640; CHECK-NEXT:    vpsel q6, q1, q0
641; CHECK-NEXT:    vmov.u16 r0, q6[0]
642; CHECK-NEXT:    vmov.32 q2[0], r0
643; CHECK-NEXT:    vmov.u16 r0, q6[1]
644; CHECK-NEXT:    vmov.32 q2[1], r0
645; CHECK-NEXT:    vmov.u16 r0, q6[2]
646; CHECK-NEXT:    vmov.32 q2[2], r0
647; CHECK-NEXT:    vmov.u16 r0, q6[3]
648; CHECK-NEXT:    vmov.32 q2[3], r0
649; CHECK-NEXT:    vcmp.i32 ne, q2, zr
650; CHECK-NEXT:    vmrs r0, p0
651; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
652; CHECK-NEXT:    vmov.u8 r2, q1[0]
653; CHECK-NEXT:    and r1, r0, #1
654; CHECK-NEXT:    rsbs r1, r1, #0
655; CHECK-NEXT:    vmov.32 q7[0], r1
656; CHECK-NEXT:    vmov.32 q7[1], r1
657; CHECK-NEXT:    ubfx r1, r0, #4, #1
658; CHECK-NEXT:    rsbs r1, r1, #0
659; CHECK-NEXT:    vmov.32 q7[2], r1
660; CHECK-NEXT:    vmov.32 q7[3], r1
661; CHECK-NEXT:    vmov.u8 r1, q3[0]
662; CHECK-NEXT:    vmov.32 q0[0], r1
663; CHECK-NEXT:    vmov.u8 r1, q3[1]
664; CHECK-NEXT:    vmov.32 q0[2], r1
665; CHECK-NEXT:    vand q2, q0, q4
666; CHECK-NEXT:    vmov.32 q0[0], r2
667; CHECK-NEXT:    vmov.u8 r2, q1[1]
668; CHECK-NEXT:    vmov r1, s8
669; CHECK-NEXT:    vmov.32 q0[2], r2
670; CHECK-NEXT:    vand q1, q0, q4
671; CHECK-NEXT:    vmov r2, s4
672; CHECK-NEXT:    umull r1, r2, r2, r1
673; CHECK-NEXT:    vmov.32 q0[0], r1
674; CHECK-NEXT:    vmov r1, s10
675; CHECK-NEXT:    vmov.32 q0[1], r2
676; CHECK-NEXT:    vmov r2, s6
677; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
678; CHECK-NEXT:    umull r1, r2, r2, r1
679; CHECK-NEXT:    vmov.32 q0[2], r1
680; CHECK-NEXT:    vmov.32 q0[3], r2
681; CHECK-NEXT:    vand q0, q0, q7
682; CHECK-NEXT:    vmov r1, s2
683; CHECK-NEXT:    vmov r2, s0
684; CHECK-NEXT:    vmov r12, s3
685; CHECK-NEXT:    vmov r3, s1
686; CHECK-NEXT:    adds r1, r1, r2
687; CHECK-NEXT:    adc.w r2, r3, r12
688; CHECK-NEXT:    ubfx r3, r0, #8, #1
689; CHECK-NEXT:    rsbs r3, r3, #0
690; CHECK-NEXT:    ubfx r0, r0, #12, #1
691; CHECK-NEXT:    vmov.32 q7[0], r3
692; CHECK-NEXT:    rsbs r0, r0, #0
693; CHECK-NEXT:    vmov.32 q7[1], r3
694; CHECK-NEXT:    vmov.u8 r3, q2[2]
695; CHECK-NEXT:    vmov.32 q7[2], r0
696; CHECK-NEXT:    vmov.32 q1[0], r3
697; CHECK-NEXT:    vmov.32 q7[3], r0
698; CHECK-NEXT:    vmov.u8 r0, q3[2]
699; CHECK-NEXT:    vmov.32 q0[0], r0
700; CHECK-NEXT:    vmov.u8 r0, q3[3]
701; CHECK-NEXT:    vmov.u8 r3, q2[3]
702; CHECK-NEXT:    vmov.32 q0[2], r0
703; CHECK-NEXT:    vmov.32 q1[2], r3
704; CHECK-NEXT:    vand q0, q0, q4
705; CHECK-NEXT:    vand q1, q1, q4
706; CHECK-NEXT:    vmov r0, s0
707; CHECK-NEXT:    vmov r3, s4
708; CHECK-NEXT:    umull r0, r3, r3, r0
709; CHECK-NEXT:    vmov.32 q2[0], r0
710; CHECK-NEXT:    vmov r0, s2
711; CHECK-NEXT:    vmov.32 q2[1], r3
712; CHECK-NEXT:    vmov r3, s6
713; CHECK-NEXT:    umull r0, r3, r3, r0
714; CHECK-NEXT:    vmov.32 q2[2], r0
715; CHECK-NEXT:    vmov.32 q2[3], r3
716; CHECK-NEXT:    vand q0, q2, q7
717; CHECK-NEXT:    vmov r3, s0
718; CHECK-NEXT:    vmov r0, s1
719; CHECK-NEXT:    adds r1, r1, r3
720; CHECK-NEXT:    vmov r3, s3
721; CHECK-NEXT:    adcs r2, r0
722; CHECK-NEXT:    vmov r0, s2
723; CHECK-NEXT:    adds.w r12, r1, r0
724; CHECK-NEXT:    adc.w r1, r2, r3
725; CHECK-NEXT:    vmov.u16 r2, q6[4]
726; CHECK-NEXT:    vmov.32 q0[0], r2
727; CHECK-NEXT:    vmov.u16 r2, q6[5]
728; CHECK-NEXT:    vmov.32 q0[1], r2
729; CHECK-NEXT:    vmov.u16 r2, q6[6]
730; CHECK-NEXT:    vmov.32 q0[2], r2
731; CHECK-NEXT:    vmov.u16 r2, q6[7]
732; CHECK-NEXT:    vmov.32 q0[3], r2
733; CHECK-NEXT:    vcmp.i32 ne, q0, zr
734; CHECK-NEXT:    vmrs lr, p0
735; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
736; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
737; CHECK-NEXT:    and r3, lr, #1
738; CHECK-NEXT:    rsbs r3, r3, #0
739; CHECK-NEXT:    vmov.32 q6[0], r3
740; CHECK-NEXT:    vmov.32 q6[1], r3
741; CHECK-NEXT:    ubfx r3, lr, #4, #1
742; CHECK-NEXT:    rsbs r3, r3, #0
743; CHECK-NEXT:    vmov.32 q6[2], r3
744; CHECK-NEXT:    vmov.32 q6[3], r3
745; CHECK-NEXT:    vmov.u8 r3, q3[4]
746; CHECK-NEXT:    vmov.32 q0[0], r3
747; CHECK-NEXT:    vmov.u8 r3, q3[5]
748; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
749; CHECK-NEXT:    vmov.32 q0[2], r3
750; CHECK-NEXT:    vand q0, q0, q4
751; CHECK-NEXT:    vmov.u8 r0, q3[4]
752; CHECK-NEXT:    vmov r3, s0
753; CHECK-NEXT:    vmov.32 q1[0], r0
754; CHECK-NEXT:    vmov.u8 r0, q3[5]
755; CHECK-NEXT:    vmov.32 q1[2], r0
756; CHECK-NEXT:    vmov q7, q3
757; CHECK-NEXT:    vand q1, q1, q4
758; CHECK-NEXT:    vmov r0, s4
759; CHECK-NEXT:    umull r0, r3, r0, r3
760; CHECK-NEXT:    vmov.32 q2[0], r0
761; CHECK-NEXT:    vmov r0, s2
762; CHECK-NEXT:    vmov.32 q2[1], r3
763; CHECK-NEXT:    vmov r3, s6
764; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
765; CHECK-NEXT:    umull r0, r3, r3, r0
766; CHECK-NEXT:    vmov.32 q2[2], r0
767; CHECK-NEXT:    vmov.32 q2[3], r3
768; CHECK-NEXT:    vand q0, q2, q6
769; CHECK-NEXT:    vmov r3, s0
770; CHECK-NEXT:    vmov r0, s1
771; CHECK-NEXT:    vmov r2, s2
772; CHECK-NEXT:    adds.w r3, r3, r12
773; CHECK-NEXT:    adcs r1, r0
774; CHECK-NEXT:    vmov r0, s3
775; CHECK-NEXT:    adds r3, r3, r2
776; CHECK-NEXT:    vmov.u8 r2, q3[6]
777; CHECK-NEXT:    adcs r1, r0
778; CHECK-NEXT:    ubfx r0, lr, #8, #1
779; CHECK-NEXT:    rsbs r0, r0, #0
780; CHECK-NEXT:    vmov.32 q6[0], r0
781; CHECK-NEXT:    vmov.32 q6[1], r0
782; CHECK-NEXT:    ubfx r0, lr, #12, #1
783; CHECK-NEXT:    rsbs r0, r0, #0
784; CHECK-NEXT:    vmov.32 q6[2], r0
785; CHECK-NEXT:    vmov.32 q6[3], r0
786; CHECK-NEXT:    vmov.u8 r0, q1[6]
787; CHECK-NEXT:    vmov.32 q0[0], r0
788; CHECK-NEXT:    vmov.u8 r0, q1[7]
789; CHECK-NEXT:    vmov.32 q1[0], r2
790; CHECK-NEXT:    vmov.u8 r2, q3[7]
791; CHECK-NEXT:    vmov.32 q0[2], r0
792; CHECK-NEXT:    vmov.32 q1[2], r2
793; CHECK-NEXT:    vand q0, q0, q4
794; CHECK-NEXT:    vand q1, q1, q4
795; CHECK-NEXT:    vmov r0, s0
796; CHECK-NEXT:    vmov r2, s4
797; CHECK-NEXT:    umull r0, r2, r2, r0
798; CHECK-NEXT:    vmov.32 q2[0], r0
799; CHECK-NEXT:    vmov r0, s2
800; CHECK-NEXT:    vmov.32 q2[1], r2
801; CHECK-NEXT:    vmov r2, s6
802; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
803; CHECK-NEXT:    umull r0, r2, r2, r0
804; CHECK-NEXT:    vmov.32 q2[2], r0
805; CHECK-NEXT:    vmov.32 q2[3], r2
806; CHECK-NEXT:    vand q0, q2, q6
807; CHECK-NEXT:    vmov r2, s0
808; CHECK-NEXT:    vmov r0, s1
809; CHECK-NEXT:    adds r2, r2, r3
810; CHECK-NEXT:    vmov r3, s3
811; CHECK-NEXT:    adcs r1, r0
812; CHECK-NEXT:    vmov r0, s2
813; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
814; CHECK-NEXT:    adds.w r12, r2, r0
815; CHECK-NEXT:    vmov.u8 r2, q5[8]
816; CHECK-NEXT:    vmov.16 q6[0], r2
817; CHECK-NEXT:    vmov.u8 r2, q5[9]
818; CHECK-NEXT:    vmov.16 q6[1], r2
819; CHECK-NEXT:    vmov.u8 r2, q5[10]
820; CHECK-NEXT:    vmov.16 q6[2], r2
821; CHECK-NEXT:    vmov.u8 r2, q5[11]
822; CHECK-NEXT:    vmov.16 q6[3], r2
823; CHECK-NEXT:    vmov.u8 r2, q5[12]
824; CHECK-NEXT:    vmov.16 q6[4], r2
825; CHECK-NEXT:    vmov.u8 r2, q5[13]
826; CHECK-NEXT:    vmov.16 q6[5], r2
827; CHECK-NEXT:    vmov.u8 r2, q5[14]
828; CHECK-NEXT:    vmov.16 q6[6], r2
829; CHECK-NEXT:    vmov.u8 r2, q5[15]
830; CHECK-NEXT:    vmov.16 q6[7], r2
831; CHECK-NEXT:    adcs r1, r3
832; CHECK-NEXT:    vcmp.i16 ne, q6, zr
833; CHECK-NEXT:    vmov.u8 r0, q7[8]
834; CHECK-NEXT:    vpsel q3, q1, q0
835; CHECK-NEXT:    vmov.32 q1[0], r0
836; CHECK-NEXT:    vmov.u16 r2, q3[0]
837; CHECK-NEXT:    vmov.u8 r0, q7[9]
838; CHECK-NEXT:    vmov.32 q0[0], r2
839; CHECK-NEXT:    vmov.u16 r2, q3[1]
840; CHECK-NEXT:    vmov.32 q0[1], r2
841; CHECK-NEXT:    vmov.u16 r2, q3[2]
842; CHECK-NEXT:    vmov.32 q0[2], r2
843; CHECK-NEXT:    vmov.u16 r2, q3[3]
844; CHECK-NEXT:    vmov.32 q0[3], r2
845; CHECK-NEXT:    vmov.32 q1[2], r0
846; CHECK-NEXT:    vcmp.i32 ne, q0, zr
847; CHECK-NEXT:    vmrs lr, p0
848; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
849; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
850; CHECK-NEXT:    vand q1, q1, q5
851; CHECK-NEXT:    vmov r0, s4
852; CHECK-NEXT:    and r3, lr, #1
853; CHECK-NEXT:    rsbs r3, r3, #0
854; CHECK-NEXT:    vmov.32 q4[0], r3
855; CHECK-NEXT:    vmov.32 q4[1], r3
856; CHECK-NEXT:    ubfx r3, lr, #4, #1
857; CHECK-NEXT:    rsbs r3, r3, #0
858; CHECK-NEXT:    vmov.32 q4[2], r3
859; CHECK-NEXT:    vmov.32 q4[3], r3
860; CHECK-NEXT:    vmov.u8 r3, q6[8]
861; CHECK-NEXT:    vmov.32 q0[0], r3
862; CHECK-NEXT:    vmov.u8 r3, q6[9]
863; CHECK-NEXT:    vmov.32 q0[2], r3
864; CHECK-NEXT:    vand q0, q0, q5
865; CHECK-NEXT:    vmov r3, s0
866; CHECK-NEXT:    umull r0, r3, r0, r3
867; CHECK-NEXT:    vmov.32 q2[0], r0
868; CHECK-NEXT:    vmov r0, s2
869; CHECK-NEXT:    vmov.32 q2[1], r3
870; CHECK-NEXT:    vmov r3, s6
871; CHECK-NEXT:    umull r0, r3, r3, r0
872; CHECK-NEXT:    vmov.32 q2[2], r0
873; CHECK-NEXT:    vmov.32 q2[3], r3
874; CHECK-NEXT:    vand q0, q2, q4
875; CHECK-NEXT:    vmov r3, s0
876; CHECK-NEXT:    vmov r0, s1
877; CHECK-NEXT:    vmov r2, s2
878; CHECK-NEXT:    adds.w r3, r3, r12
879; CHECK-NEXT:    adcs r1, r0
880; CHECK-NEXT:    vmov r0, s3
881; CHECK-NEXT:    adds r3, r3, r2
882; CHECK-NEXT:    vmov.u8 r2, q7[10]
883; CHECK-NEXT:    vmov.32 q1[0], r2
884; CHECK-NEXT:    vmov.u8 r2, q7[11]
885; CHECK-NEXT:    vmov.32 q1[2], r2
886; CHECK-NEXT:    vand q1, q1, q5
887; CHECK-NEXT:    vmov r2, s4
888; CHECK-NEXT:    adcs r1, r0
889; CHECK-NEXT:    ubfx r0, lr, #8, #1
890; CHECK-NEXT:    rsbs r0, r0, #0
891; CHECK-NEXT:    vmov.32 q4[0], r0
892; CHECK-NEXT:    vmov.32 q4[1], r0
893; CHECK-NEXT:    ubfx r0, lr, #12, #1
894; CHECK-NEXT:    rsbs r0, r0, #0
895; CHECK-NEXT:    vmov.32 q4[2], r0
896; CHECK-NEXT:    vmov.32 q4[3], r0
897; CHECK-NEXT:    vmov.u8 r0, q6[10]
898; CHECK-NEXT:    vmov.32 q0[0], r0
899; CHECK-NEXT:    vmov.u8 r0, q6[11]
900; CHECK-NEXT:    vmov.32 q0[2], r0
901; CHECK-NEXT:    vand q0, q0, q5
902; CHECK-NEXT:    vmov r0, s0
903; CHECK-NEXT:    umull r0, r2, r2, r0
904; CHECK-NEXT:    vmov.32 q2[0], r0
905; CHECK-NEXT:    vmov r0, s2
906; CHECK-NEXT:    vmov.32 q2[1], r2
907; CHECK-NEXT:    vmov r2, s6
908; CHECK-NEXT:    umull r0, r2, r2, r0
909; CHECK-NEXT:    vmov.32 q2[2], r0
910; CHECK-NEXT:    vmov.32 q2[3], r2
911; CHECK-NEXT:    vand q0, q2, q4
912; CHECK-NEXT:    vmov r2, s0
913; CHECK-NEXT:    vmov r0, s1
914; CHECK-NEXT:    adds r2, r2, r3
915; CHECK-NEXT:    vmov r3, s3
916; CHECK-NEXT:    adcs r1, r0
917; CHECK-NEXT:    vmov r0, s2
918; CHECK-NEXT:    adds.w r12, r2, r0
919; CHECK-NEXT:    vmov.u16 r2, q3[4]
920; CHECK-NEXT:    vmov.32 q0[0], r2
921; CHECK-NEXT:    vmov.u16 r2, q3[5]
922; CHECK-NEXT:    vmov.32 q0[1], r2
923; CHECK-NEXT:    vmov.u16 r2, q3[6]
924; CHECK-NEXT:    vmov.32 q0[2], r2
925; CHECK-NEXT:    vmov.u16 r2, q3[7]
926; CHECK-NEXT:    vmov.32 q0[3], r2
927; CHECK-NEXT:    adcs r1, r3
928; CHECK-NEXT:    vcmp.i32 ne, q0, zr
929; CHECK-NEXT:    vmov.u8 r0, q7[12]
930; CHECK-NEXT:    vmrs lr, p0
931; CHECK-NEXT:    vmov.32 q1[0], r0
932; CHECK-NEXT:    vmov.u8 r0, q7[13]
933; CHECK-NEXT:    vmov.32 q1[2], r0
934; CHECK-NEXT:    vand q1, q1, q5
935; CHECK-NEXT:    vmov r0, s4
936; CHECK-NEXT:    and r3, lr, #1
937; CHECK-NEXT:    rsbs r3, r3, #0
938; CHECK-NEXT:    vmov.32 q3[0], r3
939; CHECK-NEXT:    vmov.32 q3[1], r3
940; CHECK-NEXT:    ubfx r3, lr, #4, #1
941; CHECK-NEXT:    rsbs r3, r3, #0
942; CHECK-NEXT:    vmov.32 q3[2], r3
943; CHECK-NEXT:    vmov.32 q3[3], r3
944; CHECK-NEXT:    vmov.u8 r3, q6[12]
945; CHECK-NEXT:    vmov.32 q0[0], r3
946; CHECK-NEXT:    vmov.u8 r3, q6[13]
947; CHECK-NEXT:    vmov.32 q0[2], r3
948; CHECK-NEXT:    vand q0, q0, q5
949; CHECK-NEXT:    vmov r3, s0
950; CHECK-NEXT:    umull r0, r3, r0, r3
951; CHECK-NEXT:    vmov.32 q2[0], r0
952; CHECK-NEXT:    vmov r0, s2
953; CHECK-NEXT:    vmov.32 q2[1], r3
954; CHECK-NEXT:    vmov r3, s6
955; CHECK-NEXT:    umull r0, r3, r3, r0
956; CHECK-NEXT:    vmov.32 q2[2], r0
957; CHECK-NEXT:    vmov.32 q2[3], r3
958; CHECK-NEXT:    vand q0, q2, q3
959; CHECK-NEXT:    vmov r3, s0
960; CHECK-NEXT:    vmov r0, s1
961; CHECK-NEXT:    vmov r2, s2
962; CHECK-NEXT:    adds.w r3, r3, r12
963; CHECK-NEXT:    adcs r1, r0
964; CHECK-NEXT:    vmov r0, s3
965; CHECK-NEXT:    adds r3, r3, r2
966; CHECK-NEXT:    vmov.u8 r2, q7[14]
967; CHECK-NEXT:    vmov.32 q1[0], r2
968; CHECK-NEXT:    vmov.u8 r2, q7[15]
969; CHECK-NEXT:    vmov.32 q1[2], r2
970; CHECK-NEXT:    vand q1, q1, q5
971; CHECK-NEXT:    vmov r2, s4
972; CHECK-NEXT:    adcs r1, r0
973; CHECK-NEXT:    ubfx r0, lr, #8, #1
974; CHECK-NEXT:    rsbs r0, r0, #0
975; CHECK-NEXT:    vmov.32 q3[0], r0
976; CHECK-NEXT:    vmov.32 q3[1], r0
977; CHECK-NEXT:    ubfx r0, lr, #12, #1
978; CHECK-NEXT:    rsbs r0, r0, #0
979; CHECK-NEXT:    vmov.32 q3[2], r0
980; CHECK-NEXT:    vmov.32 q3[3], r0
981; CHECK-NEXT:    vmov.u8 r0, q6[14]
982; CHECK-NEXT:    vmov.32 q0[0], r0
983; CHECK-NEXT:    vmov.u8 r0, q6[15]
984; CHECK-NEXT:    vmov.32 q0[2], r0
985; CHECK-NEXT:    vand q0, q0, q5
986; CHECK-NEXT:    vmov r0, s0
987; CHECK-NEXT:    umull r0, r2, r2, r0
988; CHECK-NEXT:    vmov.32 q2[0], r0
989; CHECK-NEXT:    vmov r0, s2
990; CHECK-NEXT:    vmov.32 q2[1], r2
991; CHECK-NEXT:    vmov r2, s6
992; CHECK-NEXT:    umull r0, r2, r2, r0
993; CHECK-NEXT:    vmov.32 q2[2], r0
994; CHECK-NEXT:    vmov.32 q2[3], r2
995; CHECK-NEXT:    vand q0, q2, q3
996; CHECK-NEXT:    vmov r2, s0
997; CHECK-NEXT:    vmov r0, s1
998; CHECK-NEXT:    adds r2, r2, r3
999; CHECK-NEXT:    vmov r3, s3
1000; CHECK-NEXT:    adcs r1, r0
1001; CHECK-NEXT:    vmov r0, s2
1002; CHECK-NEXT:    adds r0, r0, r2
1003; CHECK-NEXT:    adcs r1, r3
1004; CHECK-NEXT:    add sp, #88
1005; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1006; CHECK-NEXT:    pop {r7, pc}
1007entry:
1008  %c = icmp eq <16 x i8> %b, zeroinitializer
1009  %xx = zext <16 x i8> %x to <16 x i64>
1010  %yy = zext <16 x i8> %y to <16 x i64>
1011  %m = mul <16 x i64> %xx, %yy
1012  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
1013  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
1014  ret i64 %z
1015}
1016
; Predicated multiply-add reduction widened to i64: sign-extends the <16 x i8>
; inputs %x and %y to <16 x i64>, multiplies them, replaces every lane where %b
; is nonzero with zero, and sums all lanes into a single i64.
; The expected output below is scalarized rather than a single reduction
; instruction: byte lanes are extracted, sign-extended with sxtb and multiplied
; with smull; each pair of products is masked with vand against an all-ones or
; zero lane mask rebuilt from the predicate bits read through vmrs; and the
; masked products are accumulated with adds/adc chains, ending in r0:r1.
1017define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
1018; CHECK-LABEL: add_v16i8_v16i64_sext:
1019; CHECK:       @ %bb.0: @ %entry
1020; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1021; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1022; CHECK-NEXT:    vcmp.i8 eq, q2, zr
1023; CHECK-NEXT:    vmov.i8 q2, #0x0
1024; CHECK-NEXT:    vmov.i8 q3, #0xff
1025; CHECK-NEXT:    vmov.u8 r2, q0[0]
1026; CHECK-NEXT:    vpsel q4, q3, q2
1027; CHECK-NEXT:    sxtb r2, r2
1028; CHECK-NEXT:    vmov.u8 r0, q4[0]
1029; CHECK-NEXT:    vmov.16 q5[0], r0
1030; CHECK-NEXT:    vmov.u8 r0, q4[1]
1031; CHECK-NEXT:    vmov.16 q5[1], r0
1032; CHECK-NEXT:    vmov.u8 r0, q4[2]
1033; CHECK-NEXT:    vmov.16 q5[2], r0
1034; CHECK-NEXT:    vmov.u8 r0, q4[3]
1035; CHECK-NEXT:    vmov.16 q5[3], r0
1036; CHECK-NEXT:    vmov.u8 r0, q4[4]
1037; CHECK-NEXT:    vmov.16 q5[4], r0
1038; CHECK-NEXT:    vmov.u8 r0, q4[5]
1039; CHECK-NEXT:    vmov.16 q5[5], r0
1040; CHECK-NEXT:    vmov.u8 r0, q4[6]
1041; CHECK-NEXT:    vmov.16 q5[6], r0
1042; CHECK-NEXT:    vmov.u8 r0, q4[7]
1043; CHECK-NEXT:    vmov.16 q5[7], r0
1044; CHECK-NEXT:    vcmp.i16 ne, q5, zr
1045; CHECK-NEXT:    vpsel q5, q3, q2
1046; CHECK-NEXT:    vmov.u16 r0, q5[0]
1047; CHECK-NEXT:    vmov.32 q6[0], r0
1048; CHECK-NEXT:    vmov.u16 r0, q5[1]
1049; CHECK-NEXT:    vmov.32 q6[1], r0
1050; CHECK-NEXT:    vmov.u16 r0, q5[2]
1051; CHECK-NEXT:    vmov.32 q6[2], r0
1052; CHECK-NEXT:    vmov.u16 r0, q5[3]
1053; CHECK-NEXT:    vmov.32 q6[3], r0
1054; CHECK-NEXT:    vcmp.i32 ne, q6, zr
1055; CHECK-NEXT:    vmrs r0, p0
1056; CHECK-NEXT:    and r1, r0, #1
1057; CHECK-NEXT:    rsbs r1, r1, #0
1058; CHECK-NEXT:    vmov.32 q6[0], r1
1059; CHECK-NEXT:    vmov.32 q6[1], r1
1060; CHECK-NEXT:    ubfx r1, r0, #4, #1
1061; CHECK-NEXT:    rsbs r1, r1, #0
1062; CHECK-NEXT:    vmov.32 q6[2], r1
1063; CHECK-NEXT:    vmov.32 q6[3], r1
1064; CHECK-NEXT:    vmov.u8 r1, q1[0]
1065; CHECK-NEXT:    sxtb r1, r1
1066; CHECK-NEXT:    smull r1, r2, r2, r1
1067; CHECK-NEXT:    vmov.32 q7[0], r1
1068; CHECK-NEXT:    vmov.u8 r1, q1[1]
1069; CHECK-NEXT:    vmov.32 q7[1], r2
1070; CHECK-NEXT:    vmov.u8 r2, q0[1]
1071; CHECK-NEXT:    sxtb r1, r1
1072; CHECK-NEXT:    sxtb r2, r2
1073; CHECK-NEXT:    smull r1, r2, r2, r1
1074; CHECK-NEXT:    vmov.32 q7[2], r1
1075; CHECK-NEXT:    vmov.32 q7[3], r2
1076; CHECK-NEXT:    vand q6, q7, q6
1077; CHECK-NEXT:    vmov r1, s26
1078; CHECK-NEXT:    vmov r2, s24
1079; CHECK-NEXT:    vmov r12, s27
1080; CHECK-NEXT:    vmov r3, s25
1081; CHECK-NEXT:    adds r1, r1, r2
1082; CHECK-NEXT:    adc.w r2, r3, r12
1083; CHECK-NEXT:    ubfx r3, r0, #8, #1
1084; CHECK-NEXT:    rsbs r3, r3, #0
1085; CHECK-NEXT:    ubfx r0, r0, #12, #1
1086; CHECK-NEXT:    vmov.32 q6[0], r3
1087; CHECK-NEXT:    rsbs r0, r0, #0
1088; CHECK-NEXT:    vmov.32 q6[1], r3
1089; CHECK-NEXT:    vmov.u8 r3, q0[2]
1090; CHECK-NEXT:    vmov.32 q6[2], r0
1091; CHECK-NEXT:    sxtb r3, r3
1092; CHECK-NEXT:    vmov.32 q6[3], r0
1093; CHECK-NEXT:    vmov.u8 r0, q1[2]
1094; CHECK-NEXT:    sxtb r0, r0
1095; CHECK-NEXT:    smull r0, r3, r3, r0
1096; CHECK-NEXT:    vmov.32 q7[0], r0
1097; CHECK-NEXT:    vmov.u8 r0, q1[3]
1098; CHECK-NEXT:    vmov.32 q7[1], r3
1099; CHECK-NEXT:    vmov.u8 r3, q0[3]
1100; CHECK-NEXT:    sxtb r0, r0
1101; CHECK-NEXT:    sxtb r3, r3
1102; CHECK-NEXT:    smull r0, r3, r3, r0
1103; CHECK-NEXT:    vmov.32 q7[2], r0
1104; CHECK-NEXT:    vmov.32 q7[3], r3
1105; CHECK-NEXT:    vand q6, q7, q6
1106; CHECK-NEXT:    vmov r3, s24
1107; CHECK-NEXT:    vmov r0, s25
1108; CHECK-NEXT:    adds r1, r1, r3
1109; CHECK-NEXT:    vmov r3, s27
1110; CHECK-NEXT:    adcs r2, r0
1111; CHECK-NEXT:    vmov r0, s26
1112; CHECK-NEXT:    adds.w r12, r1, r0
1113; CHECK-NEXT:    vmov.u8 r0, q0[4]
1114; CHECK-NEXT:    adc.w r1, r2, r3
1115; CHECK-NEXT:    vmov.u16 r2, q5[4]
1116; CHECK-NEXT:    vmov.32 q6[0], r2
1117; CHECK-NEXT:    vmov.u16 r2, q5[5]
1118; CHECK-NEXT:    vmov.32 q6[1], r2
1119; CHECK-NEXT:    vmov.u16 r2, q5[6]
1120; CHECK-NEXT:    vmov.32 q6[2], r2
1121; CHECK-NEXT:    vmov.u16 r2, q5[7]
1122; CHECK-NEXT:    vmov.32 q6[3], r2
1123; CHECK-NEXT:    sxtb r0, r0
1124; CHECK-NEXT:    vcmp.i32 ne, q6, zr
1125; CHECK-NEXT:    vmrs r2, p0
1126; CHECK-NEXT:    and r3, r2, #1
1127; CHECK-NEXT:    rsbs r3, r3, #0
1128; CHECK-NEXT:    vmov.32 q5[0], r3
1129; CHECK-NEXT:    vmov.32 q5[1], r3
1130; CHECK-NEXT:    ubfx r3, r2, #4, #1
1131; CHECK-NEXT:    rsbs r3, r3, #0
1132; CHECK-NEXT:    vmov.32 q5[2], r3
1133; CHECK-NEXT:    vmov.32 q5[3], r3
1134; CHECK-NEXT:    vmov.u8 r3, q1[4]
1135; CHECK-NEXT:    sxtb r3, r3
1136; CHECK-NEXT:    smull r0, r3, r0, r3
1137; CHECK-NEXT:    vmov.32 q6[0], r0
1138; CHECK-NEXT:    vmov.u8 r0, q1[5]
1139; CHECK-NEXT:    vmov.32 q6[1], r3
1140; CHECK-NEXT:    vmov.u8 r3, q0[5]
1141; CHECK-NEXT:    sxtb r0, r0
1142; CHECK-NEXT:    sxtb r3, r3
1143; CHECK-NEXT:    smull r0, r3, r3, r0
1144; CHECK-NEXT:    vmov.32 q6[2], r0
1145; CHECK-NEXT:    vmov.32 q6[3], r3
1146; CHECK-NEXT:    vand q5, q6, q5
1147; CHECK-NEXT:    vmov r3, s20
1148; CHECK-NEXT:    vmov r0, s21
1149; CHECK-NEXT:    adds.w r3, r3, r12
1150; CHECK-NEXT:    adc.w r12, r1, r0
1151; CHECK-NEXT:    vmov r1, s22
1152; CHECK-NEXT:    vmov r0, s23
1153; CHECK-NEXT:    adds r3, r3, r1
1154; CHECK-NEXT:    adc.w r1, r12, r0
1155; CHECK-NEXT:    ubfx r0, r2, #8, #1
1156; CHECK-NEXT:    rsbs r0, r0, #0
1157; CHECK-NEXT:    vmov.32 q5[0], r0
1158; CHECK-NEXT:    vmov.32 q5[1], r0
1159; CHECK-NEXT:    ubfx r0, r2, #12, #1
1160; CHECK-NEXT:    rsbs r0, r0, #0
1161; CHECK-NEXT:    vmov.u8 r2, q0[6]
1162; CHECK-NEXT:    vmov.32 q5[2], r0
1163; CHECK-NEXT:    sxtb r2, r2
1164; CHECK-NEXT:    vmov.32 q5[3], r0
1165; CHECK-NEXT:    vmov.u8 r0, q1[6]
1166; CHECK-NEXT:    sxtb r0, r0
1167; CHECK-NEXT:    smull r0, r2, r2, r0
1168; CHECK-NEXT:    vmov.32 q6[0], r0
1169; CHECK-NEXT:    vmov.u8 r0, q1[7]
1170; CHECK-NEXT:    vmov.32 q6[1], r2
1171; CHECK-NEXT:    vmov.u8 r2, q0[7]
1172; CHECK-NEXT:    sxtb r0, r0
1173; CHECK-NEXT:    sxtb r2, r2
1174; CHECK-NEXT:    smull r0, r2, r2, r0
1175; CHECK-NEXT:    vmov.32 q6[2], r0
1176; CHECK-NEXT:    vmov.32 q6[3], r2
1177; CHECK-NEXT:    vand q5, q6, q5
1178; CHECK-NEXT:    vmov r2, s20
1179; CHECK-NEXT:    vmov r0, s21
1180; CHECK-NEXT:    adds r2, r2, r3
1181; CHECK-NEXT:    vmov r3, s23
1182; CHECK-NEXT:    adcs r1, r0
1183; CHECK-NEXT:    vmov r0, s22
1184; CHECK-NEXT:    adds.w r12, r2, r0
1185; CHECK-NEXT:    vmov.u8 r2, q4[8]
1186; CHECK-NEXT:    vmov.16 q5[0], r2
1187; CHECK-NEXT:    vmov.u8 r2, q4[9]
1188; CHECK-NEXT:    vmov.16 q5[1], r2
1189; CHECK-NEXT:    vmov.u8 r2, q4[10]
1190; CHECK-NEXT:    vmov.16 q5[2], r2
1191; CHECK-NEXT:    vmov.u8 r2, q4[11]
1192; CHECK-NEXT:    vmov.16 q5[3], r2
1193; CHECK-NEXT:    vmov.u8 r2, q4[12]
1194; CHECK-NEXT:    vmov.16 q5[4], r2
1195; CHECK-NEXT:    vmov.u8 r2, q4[13]
1196; CHECK-NEXT:    vmov.16 q5[5], r2
1197; CHECK-NEXT:    vmov.u8 r2, q4[14]
1198; CHECK-NEXT:    vmov.16 q5[6], r2
1199; CHECK-NEXT:    vmov.u8 r2, q4[15]
1200; CHECK-NEXT:    vmov.16 q5[7], r2
1201; CHECK-NEXT:    adcs r1, r3
1202; CHECK-NEXT:    vcmp.i16 ne, q5, zr
1203; CHECK-NEXT:    vmov.u8 r0, q0[8]
1204; CHECK-NEXT:    vpsel q2, q3, q2
1205; CHECK-NEXT:    sxtb r0, r0
1206; CHECK-NEXT:    vmov.u16 r2, q2[0]
1207; CHECK-NEXT:    vmov.32 q3[0], r2
1208; CHECK-NEXT:    vmov.u16 r2, q2[1]
1209; CHECK-NEXT:    vmov.32 q3[1], r2
1210; CHECK-NEXT:    vmov.u16 r2, q2[2]
1211; CHECK-NEXT:    vmov.32 q3[2], r2
1212; CHECK-NEXT:    vmov.u16 r2, q2[3]
1213; CHECK-NEXT:    vmov.32 q3[3], r2
1214; CHECK-NEXT:    vcmp.i32 ne, q3, zr
1215; CHECK-NEXT:    vmrs r2, p0
1216; CHECK-NEXT:    and r3, r2, #1
1217; CHECK-NEXT:    rsbs r3, r3, #0
1218; CHECK-NEXT:    vmov.32 q3[0], r3
1219; CHECK-NEXT:    vmov.32 q3[1], r3
1220; CHECK-NEXT:    ubfx r3, r2, #4, #1
1221; CHECK-NEXT:    rsbs r3, r3, #0
1222; CHECK-NEXT:    vmov.32 q3[2], r3
1223; CHECK-NEXT:    vmov.32 q3[3], r3
1224; CHECK-NEXT:    vmov.u8 r3, q1[8]
1225; CHECK-NEXT:    sxtb r3, r3
1226; CHECK-NEXT:    smull r0, r3, r0, r3
1227; CHECK-NEXT:    vmov.32 q4[0], r0
1228; CHECK-NEXT:    vmov.u8 r0, q1[9]
1229; CHECK-NEXT:    vmov.32 q4[1], r3
1230; CHECK-NEXT:    vmov.u8 r3, q0[9]
1231; CHECK-NEXT:    sxtb r0, r0
1232; CHECK-NEXT:    sxtb r3, r3
1233; CHECK-NEXT:    smull r0, r3, r3, r0
1234; CHECK-NEXT:    vmov.32 q4[2], r0
1235; CHECK-NEXT:    vmov.32 q4[3], r3
1236; CHECK-NEXT:    vand q3, q4, q3
1237; CHECK-NEXT:    vmov r3, s12
1238; CHECK-NEXT:    vmov r0, s13
1239; CHECK-NEXT:    adds.w r3, r3, r12
1240; CHECK-NEXT:    adc.w r12, r1, r0
1241; CHECK-NEXT:    vmov r1, s14
1242; CHECK-NEXT:    vmov r0, s15
1243; CHECK-NEXT:    adds r3, r3, r1
1244; CHECK-NEXT:    adc.w r1, r12, r0
1245; CHECK-NEXT:    ubfx r0, r2, #8, #1
1246; CHECK-NEXT:    rsbs r0, r0, #0
1247; CHECK-NEXT:    vmov.32 q3[0], r0
1248; CHECK-NEXT:    vmov.32 q3[1], r0
1249; CHECK-NEXT:    ubfx r0, r2, #12, #1
1250; CHECK-NEXT:    rsbs r0, r0, #0
1251; CHECK-NEXT:    vmov.u8 r2, q0[10]
1252; CHECK-NEXT:    vmov.32 q3[2], r0
1253; CHECK-NEXT:    sxtb r2, r2
1254; CHECK-NEXT:    vmov.32 q3[3], r0
1255; CHECK-NEXT:    vmov.u8 r0, q1[10]
1256; CHECK-NEXT:    sxtb r0, r0
1257; CHECK-NEXT:    smull r0, r2, r2, r0
1258; CHECK-NEXT:    vmov.32 q4[0], r0
1259; CHECK-NEXT:    vmov.u8 r0, q1[11]
1260; CHECK-NEXT:    vmov.32 q4[1], r2
1261; CHECK-NEXT:    vmov.u8 r2, q0[11]
1262; CHECK-NEXT:    sxtb r0, r0
1263; CHECK-NEXT:    sxtb r2, r2
1264; CHECK-NEXT:    smull r0, r2, r2, r0
1265; CHECK-NEXT:    vmov.32 q4[2], r0
1266; CHECK-NEXT:    vmov.32 q4[3], r2
1267; CHECK-NEXT:    vand q3, q4, q3
1268; CHECK-NEXT:    vmov r2, s12
1269; CHECK-NEXT:    vmov r0, s13
1270; CHECK-NEXT:    adds r2, r2, r3
1271; CHECK-NEXT:    vmov r3, s15
1272; CHECK-NEXT:    adcs r1, r0
1273; CHECK-NEXT:    vmov r0, s14
1274; CHECK-NEXT:    adds.w r12, r2, r0
1275; CHECK-NEXT:    vmov.u16 r2, q2[4]
1276; CHECK-NEXT:    vmov.32 q3[0], r2
1277; CHECK-NEXT:    vmov.u16 r2, q2[5]
1278; CHECK-NEXT:    vmov.32 q3[1], r2
1279; CHECK-NEXT:    vmov.u16 r2, q2[6]
1280; CHECK-NEXT:    vmov.32 q3[2], r2
1281; CHECK-NEXT:    vmov.u16 r2, q2[7]
1282; CHECK-NEXT:    vmov.32 q3[3], r2
1283; CHECK-NEXT:    adcs r1, r3
1284; CHECK-NEXT:    vcmp.i32 ne, q3, zr
1285; CHECK-NEXT:    vmov.u8 r0, q0[12]
1286; CHECK-NEXT:    vmrs r2, p0
1287; CHECK-NEXT:    sxtb r0, r0
1288; CHECK-NEXT:    and r3, r2, #1
1289; CHECK-NEXT:    rsbs r3, r3, #0
1290; CHECK-NEXT:    vmov.32 q2[0], r3
1291; CHECK-NEXT:    vmov.32 q2[1], r3
1292; CHECK-NEXT:    ubfx r3, r2, #4, #1
1293; CHECK-NEXT:    rsbs r3, r3, #0
1294; CHECK-NEXT:    vmov.32 q2[2], r3
1295; CHECK-NEXT:    vmov.32 q2[3], r3
1296; CHECK-NEXT:    vmov.u8 r3, q1[12]
1297; CHECK-NEXT:    sxtb r3, r3
1298; CHECK-NEXT:    smull r0, r3, r0, r3
1299; CHECK-NEXT:    vmov.32 q3[0], r0
1300; CHECK-NEXT:    vmov.u8 r0, q1[13]
1301; CHECK-NEXT:    vmov.32 q3[1], r3
1302; CHECK-NEXT:    vmov.u8 r3, q0[13]
1303; CHECK-NEXT:    sxtb r0, r0
1304; CHECK-NEXT:    sxtb r3, r3
1305; CHECK-NEXT:    smull r0, r3, r3, r0
1306; CHECK-NEXT:    vmov.32 q3[2], r0
1307; CHECK-NEXT:    vmov.32 q3[3], r3
1308; CHECK-NEXT:    vand q2, q3, q2
1309; CHECK-NEXT:    vmov r3, s8
1310; CHECK-NEXT:    vmov r0, s9
1311; CHECK-NEXT:    adds.w r3, r3, r12
1312; CHECK-NEXT:    adc.w r12, r1, r0
1313; CHECK-NEXT:    vmov r1, s10
1314; CHECK-NEXT:    vmov r0, s11
1315; CHECK-NEXT:    adds r3, r3, r1
1316; CHECK-NEXT:    adc.w r1, r12, r0
1317; CHECK-NEXT:    ubfx r0, r2, #8, #1
1318; CHECK-NEXT:    rsbs r0, r0, #0
1319; CHECK-NEXT:    vmov.32 q2[0], r0
1320; CHECK-NEXT:    vmov.32 q2[1], r0
1321; CHECK-NEXT:    ubfx r0, r2, #12, #1
1322; CHECK-NEXT:    rsbs r0, r0, #0
1323; CHECK-NEXT:    vmov.u8 r2, q0[14]
1324; CHECK-NEXT:    vmov.32 q2[2], r0
1325; CHECK-NEXT:    sxtb r2, r2
1326; CHECK-NEXT:    vmov.32 q2[3], r0
1327; CHECK-NEXT:    vmov.u8 r0, q1[14]
1328; CHECK-NEXT:    sxtb r0, r0
1329; CHECK-NEXT:    smull r0, r2, r2, r0
1330; CHECK-NEXT:    vmov.32 q3[0], r0
1331; CHECK-NEXT:    vmov.u8 r0, q1[15]
1332; CHECK-NEXT:    vmov.32 q3[1], r2
1333; CHECK-NEXT:    vmov.u8 r2, q0[15]
1334; CHECK-NEXT:    sxtb r0, r0
1335; CHECK-NEXT:    sxtb r2, r2
1336; CHECK-NEXT:    smull r0, r2, r2, r0
1337; CHECK-NEXT:    vmov.32 q3[2], r0
1338; CHECK-NEXT:    vmov.32 q3[3], r2
1339; CHECK-NEXT:    vand q0, q3, q2
1340; CHECK-NEXT:    vmov r2, s0
1341; CHECK-NEXT:    vmov r0, s1
1342; CHECK-NEXT:    adds r2, r2, r3
1343; CHECK-NEXT:    vmov r3, s3
1344; CHECK-NEXT:    adcs r1, r0
1345; CHECK-NEXT:    vmov r0, s2
1346; CHECK-NEXT:    adds r0, r0, r2
1347; CHECK-NEXT:    adcs r1, r3
1348; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1349; CHECK-NEXT:    bx lr
1350entry:
1351  %c = icmp eq <16 x i8> %b, zeroinitializer
1352  %xx = sext <16 x i8> %x to <16 x i64>
1353  %yy = sext <16 x i8> %y to <16 x i64>
1354  %m = mul <16 x i64> %xx, %yy
1355  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
1356  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
1357  ret i64 %z
1358}
1359
; Two-lane predicated multiply-add reduction: zero-extends <2 x i8> %x and %y
; to <2 x i64>, multiplies them, keeps only the lanes where %b == 0 (other
; lanes contribute zero), and adds the two lanes into an i64.
; In the expected output all three operands are first masked with vand against
; a #0xff constant; the products come from one umull per lane; the lane mask is
; built with cmp / cset / tst / csetm; and the two masked 64-bit lanes are
; summed with adds/adcs into r0:r1.
1360define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
1361; CHECK-LABEL: add_v2i8_v2i64_zext:
1362; CHECK:       @ %bb.0: @ %entry
1363; CHECK-NEXT:    .vsave {d8, d9}
1364; CHECK-NEXT:    vpush {d8, d9}
1365; CHECK-NEXT:    vmov.i64 q3, #0xff
1366; CHECK-NEXT:    vand q1, q1, q3
1367; CHECK-NEXT:    vand q4, q0, q3
1368; CHECK-NEXT:    vmov r0, s4
1369; CHECK-NEXT:    vmov r1, s16
1370; CHECK-NEXT:    umull r0, r1, r1, r0
1371; CHECK-NEXT:    vmov.32 q0[0], r0
1372; CHECK-NEXT:    vmov r0, s6
1373; CHECK-NEXT:    vmov.32 q0[1], r1
1374; CHECK-NEXT:    vmov r1, s18
1375; CHECK-NEXT:    vand q1, q2, q3
1376; CHECK-NEXT:    umull r0, r1, r1, r0
1377; CHECK-NEXT:    vmov.32 q0[2], r0
1378; CHECK-NEXT:    vmov r0, s4
1379; CHECK-NEXT:    vmov.32 q0[3], r1
1380; CHECK-NEXT:    cmp r0, #0
1381; CHECK-NEXT:    cset r0, eq
1382; CHECK-NEXT:    tst.w r0, #1
1383; CHECK-NEXT:    csetm r0, ne
1384; CHECK-NEXT:    vmov.32 q2[0], r0
1385; CHECK-NEXT:    vmov.32 q2[1], r0
1386; CHECK-NEXT:    vmov r0, s6
1387; CHECK-NEXT:    cmp r0, #0
1388; CHECK-NEXT:    cset r0, eq
1389; CHECK-NEXT:    tst.w r0, #1
1390; CHECK-NEXT:    csetm r0, ne
1391; CHECK-NEXT:    vmov.32 q2[2], r0
1392; CHECK-NEXT:    vmov.32 q2[3], r0
1393; CHECK-NEXT:    vand q0, q0, q2
1394; CHECK-NEXT:    vmov r0, s2
1395; CHECK-NEXT:    vmov r3, s0
1396; CHECK-NEXT:    vmov r1, s3
1397; CHECK-NEXT:    vmov r2, s1
1398; CHECK-NEXT:    adds r0, r0, r3
1399; CHECK-NEXT:    adcs r1, r2
1400; CHECK-NEXT:    vpop {d8, d9}
1401; CHECK-NEXT:    bx lr
1402entry:
1403  %c = icmp eq <2 x i8> %b, zeroinitializer
1404  %xx = zext <2 x i8> %x to <2 x i64>
1405  %yy = zext <2 x i8> %y to <2 x i64>
1406  %m = mul <2 x i64> %xx, %yy
1407  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1408  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1409  ret i64 %z
1410}
1411
; Signed counterpart of the two-lane i8 -> i64 case: sign-extends <2 x i8> %x
; and %y to <2 x i64>, multiplies, zeroes the lanes where %b is nonzero, and
; adds the two lanes into an i64.
; In the expected output only the predicate operand is masked with #0xff
; before being compared against zero; the data lanes are sign-extended with
; sxtb and multiplied with smull, then masked with vand and summed with
; adds/adcs into r0:r1.
1412define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
1413; CHECK-LABEL: add_v2i8_v2i64_sext:
1414; CHECK:       @ %bb.0: @ %entry
1415; CHECK-NEXT:    vmov.i32 q3, #0xff
1416; CHECK-NEXT:    vmov r1, s0
1417; CHECK-NEXT:    vand q3, q2, q3
1418; CHECK-NEXT:    vmov r0, s12
1419; CHECK-NEXT:    sxtb r1, r1
1420; CHECK-NEXT:    cmp r0, #0
1421; CHECK-NEXT:    cset r0, eq
1422; CHECK-NEXT:    tst.w r0, #1
1423; CHECK-NEXT:    csetm r0, ne
1424; CHECK-NEXT:    vmov.32 q2[0], r0
1425; CHECK-NEXT:    vmov.32 q2[1], r0
1426; CHECK-NEXT:    vmov r0, s14
1427; CHECK-NEXT:    cmp r0, #0
1428; CHECK-NEXT:    cset r0, eq
1429; CHECK-NEXT:    tst.w r0, #1
1430; CHECK-NEXT:    csetm r0, ne
1431; CHECK-NEXT:    vmov.32 q2[2], r0
1432; CHECK-NEXT:    vmov.32 q2[3], r0
1433; CHECK-NEXT:    vmov r0, s4
1434; CHECK-NEXT:    sxtb r0, r0
1435; CHECK-NEXT:    smull r0, r1, r1, r0
1436; CHECK-NEXT:    vmov.32 q3[0], r0
1437; CHECK-NEXT:    vmov r0, s6
1438; CHECK-NEXT:    vmov.32 q3[1], r1
1439; CHECK-NEXT:    vmov r1, s2
1440; CHECK-NEXT:    sxtb r0, r0
1441; CHECK-NEXT:    sxtb r1, r1
1442; CHECK-NEXT:    smull r0, r1, r1, r0
1443; CHECK-NEXT:    vmov.32 q3[2], r0
1444; CHECK-NEXT:    vmov.32 q3[3], r1
1445; CHECK-NEXT:    vand q0, q3, q2
1446; CHECK-NEXT:    vmov r0, s2
1447; CHECK-NEXT:    vmov r3, s0
1448; CHECK-NEXT:    vmov r1, s3
1449; CHECK-NEXT:    vmov r2, s1
1450; CHECK-NEXT:    adds r0, r0, r3
1451; CHECK-NEXT:    adcs r1, r2
1452; CHECK-NEXT:    bx lr
1453entry:
1454  %c = icmp eq <2 x i8> %b, zeroinitializer
1455  %xx = sext <2 x i8> %x to <2 x i64>
1456  %yy = sext <2 x i8> %y to <2 x i64>
1457  %m = mul <2 x i64> %xx, %yy
1458  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1459  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1460  ret i64 %z
1461}
1462
; Predicated multiply-add reduction on native <2 x i64> operands (no
; extension): multiplies %x by %y, zeroes the lanes where %b is nonzero, and
; adds the two lanes into an i64.
; In the expected output each 64x64 lane product is expanded into one umull
; plus two mla instructions; each 64-bit predicate lane is tested by OR-ing its
; two 32-bit halves (orrs) before cset/csetm build the lane mask; the masked
; lanes are then summed with adds/adcs into r0:r1.
1463define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
1464; CHECK-LABEL: add_v2i64_v2i64:
1465; CHECK:       @ %bb.0: @ %entry
1466; CHECK-NEXT:    vmov r0, s4
1467; CHECK-NEXT:    vmov r1, s0
1468; CHECK-NEXT:    vmov r2, s5
1469; CHECK-NEXT:    umull r12, r3, r1, r0
1470; CHECK-NEXT:    mla r1, r1, r2, r3
1471; CHECK-NEXT:    vmov r2, s1
1472; CHECK-NEXT:    vmov.32 q3[0], r12
1473; CHECK-NEXT:    mla r0, r2, r0, r1
1474; CHECK-NEXT:    vmov r1, s2
1475; CHECK-NEXT:    vmov.32 q3[1], r0
1476; CHECK-NEXT:    vmov r0, s6
1477; CHECK-NEXT:    umull r2, r3, r1, r0
1478; CHECK-NEXT:    vmov.32 q3[2], r2
1479; CHECK-NEXT:    vmov r2, s7
1480; CHECK-NEXT:    mla r1, r1, r2, r3
1481; CHECK-NEXT:    vmov r2, s3
1482; CHECK-NEXT:    mla r0, r2, r0, r1
1483; CHECK-NEXT:    vmov r1, s8
1484; CHECK-NEXT:    vmov.32 q3[3], r0
1485; CHECK-NEXT:    vmov r0, s9
1486; CHECK-NEXT:    orrs r0, r1
1487; CHECK-NEXT:    vmov r1, s10
1488; CHECK-NEXT:    cset r0, eq
1489; CHECK-NEXT:    tst.w r0, #1
1490; CHECK-NEXT:    csetm r0, ne
1491; CHECK-NEXT:    vmov.32 q0[0], r0
1492; CHECK-NEXT:    vmov.32 q0[1], r0
1493; CHECK-NEXT:    vmov r0, s11
1494; CHECK-NEXT:    orrs r0, r1
1495; CHECK-NEXT:    cset r0, eq
1496; CHECK-NEXT:    tst.w r0, #1
1497; CHECK-NEXT:    csetm r0, ne
1498; CHECK-NEXT:    vmov.32 q0[2], r0
1499; CHECK-NEXT:    vmov.32 q0[3], r0
1500; CHECK-NEXT:    vand q0, q3, q0
1501; CHECK-NEXT:    vmov r0, s2
1502; CHECK-NEXT:    vmov r3, s0
1503; CHECK-NEXT:    vmov r1, s3
1504; CHECK-NEXT:    vmov r2, s1
1505; CHECK-NEXT:    adds r0, r0, r3
1506; CHECK-NEXT:    adcs r1, r2
1507; CHECK-NEXT:    bx lr
1508entry:
1509  %c = icmp eq <2 x i64> %b, zeroinitializer
1510  %m = mul <2 x i64> %x, %y
1511  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1512  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1513  ret i64 %z
1514}
1515
; Accumulating variant of the <4 x i32> predicated multiply-add reduction: the
; lanes of %x * %y where %b == 0 are summed and the scalar %a is added to the
; result.
; The expected output is a vpt.i32 block containing a single vmlavat.u32 on
; r0, with no separate scalar add for %a.
1516define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i32 %a) {
1517; CHECK-LABEL: add_v4i32_v4i32_acc:
1518; CHECK:       @ %bb.0: @ %entry
1519; CHECK-NEXT:    vpt.i32 eq, q2, zr
1520; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
1521; CHECK-NEXT:    bx lr
1522entry:
1523  %c = icmp eq <4 x i32> %b, zeroinitializer
1524  %m = mul <4 x i32> %x, %y
1525  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1526  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1527  %r = add i32 %z, %a
1528  ret i32 %r
1529}
1530
; Accumulating, widening (unsigned) variant: zero-extends <4 x i32> %x and %y
; to <4 x i64>, multiplies, keeps the lanes where %b == 0, sums them to an i64
; and adds the i64 accumulator %a.
; The expected output is a vpt.i32 block containing a single vmlalvat.u32 on
; the r0, r1 pair, with no separate 64-bit add for %a.
1531define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
1532; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
1533; CHECK:       @ %bb.0: @ %entry
1534; CHECK-NEXT:    vpt.i32 eq, q2, zr
1535; CHECK-NEXT:    vmlalvat.u32 r0, r1, q0, q1
1536; CHECK-NEXT:    bx lr
1537entry:
1538  %c = icmp eq <4 x i32> %b, zeroinitializer
1539  %xx = zext <4 x i32> %x to <4 x i64>
1540  %yy = zext <4 x i32> %y to <4 x i64>
1541  %m = mul <4 x i64> %xx, %yy
1542  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1543  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1544  %r = add i64 %z, %a
1545  ret i64 %r
1546}
1547
; Accumulating, widening (signed) variant: sign-extends <4 x i32> %x and %y to
; <4 x i64>, multiplies, keeps the lanes where %b == 0, sums them to an i64 and
; adds the i64 accumulator %a.
; The expected output is a vpt.i32 block containing a single vmlalvat.s32 on
; the r0, r1 pair, with no separate 64-bit add for %a.
1548define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
1549; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
1550; CHECK:       @ %bb.0: @ %entry
1551; CHECK-NEXT:    vpt.i32 eq, q2, zr
1552; CHECK-NEXT:    vmlalvat.s32 r0, r1, q0, q1
1553; CHECK-NEXT:    bx lr
1554entry:
1555  %c = icmp eq <4 x i32> %b, zeroinitializer
1556  %xx = sext <4 x i32> %x to <4 x i64>
1557  %yy = sext <4 x i32> %y to <4 x i64>
1558  %m = mul <4 x i64> %xx, %yy
1559  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1560  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1561  %r = add i64 %z, %a
1562  ret i64 %r
1563}
1564
; Accumulating two-lane case (unsigned): zero-extends <2 x i32> %x and %y to
; <2 x i64>, multiplies, keeps the lanes where %b == 0, sums the two lanes and
; adds the i64 accumulator %a.
; In the expected output the products come from one vmullb.u32; the lane mask
; is built with cmp / cset / tst / csetm and applied with vand; the two lanes
; are summed with adds/adc and then added to r0:r1 with a second adds/adcs.
; lr is used as a scratch register, so it is saved with push {r7, lr} and the
; function returns through pop {r7, pc}.
1565define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
1566; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
1567; CHECK:       @ %bb.0: @ %entry
1568; CHECK-NEXT:    .save {r7, lr}
1569; CHECK-NEXT:    push {r7, lr}
1570; CHECK-NEXT:    vmov r2, s8
1571; CHECK-NEXT:    vmullb.u32 q3, q0, q1
1572; CHECK-NEXT:    cmp r2, #0
1573; CHECK-NEXT:    cset r2, eq
1574; CHECK-NEXT:    tst.w r2, #1
1575; CHECK-NEXT:    csetm r2, ne
1576; CHECK-NEXT:    vmov.32 q0[0], r2
1577; CHECK-NEXT:    vmov.32 q0[1], r2
1578; CHECK-NEXT:    vmov r2, s10
1579; CHECK-NEXT:    cmp r2, #0
1580; CHECK-NEXT:    cset r2, eq
1581; CHECK-NEXT:    tst.w r2, #1
1582; CHECK-NEXT:    csetm r2, ne
1583; CHECK-NEXT:    vmov.32 q0[2], r2
1584; CHECK-NEXT:    vmov.32 q0[3], r2
1585; CHECK-NEXT:    vand q0, q3, q0
1586; CHECK-NEXT:    vmov r2, s2
1587; CHECK-NEXT:    vmov r3, s0
1588; CHECK-NEXT:    vmov r12, s3
1589; CHECK-NEXT:    vmov lr, s1
1590; CHECK-NEXT:    adds r2, r2, r3
1591; CHECK-NEXT:    adc.w r3, lr, r12
1592; CHECK-NEXT:    adds r0, r0, r2
1593; CHECK-NEXT:    adcs r1, r3
1594; CHECK-NEXT:    pop {r7, pc}
1595entry:
1596  %c = icmp eq <2 x i32> %b, zeroinitializer
1597  %xx = zext <2 x i32> %x to <2 x i64>
1598  %yy = zext <2 x i32> %y to <2 x i64>
1599  %m = mul <2 x i64> %xx, %yy
1600  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1601  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1602  %r = add i64 %z, %a
1603  ret i64 %r
1604}
1605
; Accumulating two-lane case (signed): sign-extends <2 x i32> %x and %y to
; <2 x i64>, multiplies, keeps the lanes where %b == 0, sums the two lanes and
; adds the i64 accumulator %a.
; The expected output has the same shape as the zero-extending variant except
; that the products come from vmullb.s32: mask built with cmp / cset / tst /
; csetm, applied with vand, lanes summed with adds/adc, then added to r0:r1.
; lr is used as a scratch register, so it is saved with push {r7, lr} and the
; function returns through pop {r7, pc}.
1606define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
1607; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
1608; CHECK:       @ %bb.0: @ %entry
1609; CHECK-NEXT:    .save {r7, lr}
1610; CHECK-NEXT:    push {r7, lr}
1611; CHECK-NEXT:    vmov r2, s8
1612; CHECK-NEXT:    vmullb.s32 q3, q0, q1
1613; CHECK-NEXT:    cmp r2, #0
1614; CHECK-NEXT:    cset r2, eq
1615; CHECK-NEXT:    tst.w r2, #1
1616; CHECK-NEXT:    csetm r2, ne
1617; CHECK-NEXT:    vmov.32 q0[0], r2
1618; CHECK-NEXT:    vmov.32 q0[1], r2
1619; CHECK-NEXT:    vmov r2, s10
1620; CHECK-NEXT:    cmp r2, #0
1621; CHECK-NEXT:    cset r2, eq
1622; CHECK-NEXT:    tst.w r2, #1
1623; CHECK-NEXT:    csetm r2, ne
1624; CHECK-NEXT:    vmov.32 q0[2], r2
1625; CHECK-NEXT:    vmov.32 q0[3], r2
1626; CHECK-NEXT:    vand q0, q3, q0
1627; CHECK-NEXT:    vmov r2, s2
1628; CHECK-NEXT:    vmov r3, s0
1629; CHECK-NEXT:    vmov r12, s3
1630; CHECK-NEXT:    vmov lr, s1
1631; CHECK-NEXT:    adds r2, r2, r3
1632; CHECK-NEXT:    adc.w r3, lr, r12
1633; CHECK-NEXT:    adds r0, r0, r2
1634; CHECK-NEXT:    adcs r1, r3
1635; CHECK-NEXT:    pop {r7, pc}
1636entry:
1637  %c = icmp eq <2 x i32> %b, zeroinitializer
1638  %xx = sext <2 x i32> %x to <2 x i64>
1639  %yy = sext <2 x i32> %y to <2 x i64>
1640  %m = mul <2 x i64> %xx, %yy
1641  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1642  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1643  %r = add i64 %z, %a
1644  ret i64 %r
1645}
1646
; Accumulating i16 -> i32 case (unsigned): zero-extends <8 x i16> %x and %y to
; <8 x i32>, multiplies, keeps the lanes where %b == 0, sums them to an i32 and
; adds the scalar %a.
; The expected output is a vpt.i16 block containing a single vmlavat.u16 on
; r0; no explicit extension or scalar add appears.
1647define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
1648; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
1649; CHECK:       @ %bb.0: @ %entry
1650; CHECK-NEXT:    vpt.i16 eq, q2, zr
1651; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
1652; CHECK-NEXT:    bx lr
1653entry:
1654  %c = icmp eq <8 x i16> %b, zeroinitializer
1655  %xx = zext <8 x i16> %x to <8 x i32>
1656  %yy = zext <8 x i16> %y to <8 x i32>
1657  %m = mul <8 x i32> %xx, %yy
1658  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
1659  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
1660  %r = add i32 %z, %a
1661  ret i32 %r
1662}
1663
; Accumulating i16 -> i32 case (signed): sign-extends <8 x i16> %x and %y to
; <8 x i32>, multiplies, keeps the lanes where %b == 0, sums them to an i32 and
; adds the scalar %a.
; The expected output is a vpt.i16 block containing a single vmlavat.s16 on
; r0; no explicit extension or scalar add appears.
1664define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
1665; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
1666; CHECK:       @ %bb.0: @ %entry
1667; CHECK-NEXT:    vpt.i16 eq, q2, zr
1668; CHECK-NEXT:    vmlavat.s16 r0, q0, q1
1669; CHECK-NEXT:    bx lr
1670entry:
1671  %c = icmp eq <8 x i16> %b, zeroinitializer
1672  %xx = sext <8 x i16> %x to <8 x i32>
1673  %yy = sext <8 x i16> %y to <8 x i32>
1674  %m = mul <8 x i32> %xx, %yy
1675  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
1676  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
1677  %r = add i32 %z, %a
1678  ret i32 %r
1679}
1680
; Accumulating case on a <4 x i16> input type (unsigned): zero-extends %x and
; %y to <4 x i32>, multiplies, keeps the lanes where %b == 0, sums them to an
; i32 and adds the scalar %a.
; In the expected output all three operands are widened with vmovlb.u16 first,
; after which the reduction is a vpt.i32 block with a single vmlavat.u32 on r0.
1681define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
1682; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
1683; CHECK:       @ %bb.0: @ %entry
1684; CHECK-NEXT:    vmovlb.u16 q1, q1
1685; CHECK-NEXT:    vmovlb.u16 q0, q0
1686; CHECK-NEXT:    vmovlb.u16 q2, q2
1687; CHECK-NEXT:    vpt.i32 eq, q2, zr
1688; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
1689; CHECK-NEXT:    bx lr
1690entry:
1691  %c = icmp eq <4 x i16> %b, zeroinitializer
1692  %xx = zext <4 x i16> %x to <4 x i32>
1693  %yy = zext <4 x i16> %y to <4 x i32>
1694  %m = mul <4 x i32> %xx, %yy
1695  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1696  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1697  %r = add i32 %z, %a
1698  ret i32 %r
1699}
1700
; Four-lane signed variant taking <4 x i16> operands: sign-extend %x and %y to
; i32, multiply, keep lanes where %b == 0, sum, and add %a. In the expected
; output the multiplicands are widened with vmovlb.s16 while the predicate
; operand is widened with vmovlb.u16; the reduction itself is vmlavat.u32.
1701define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
1702; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
1703; CHECK:       @ %bb.0: @ %entry
1704; CHECK-NEXT:    vmovlb.s16 q1, q1
1705; CHECK-NEXT:    vmovlb.s16 q0, q0
1706; CHECK-NEXT:    vmovlb.u16 q2, q2
1707; CHECK-NEXT:    vpt.i32 eq, q2, zr
1708; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
1709; CHECK-NEXT:    bx lr
1710entry:
1711  %c = icmp eq <4 x i16> %b, zeroinitializer
1712  %xx = sext <4 x i16> %x to <4 x i32>
1713  %yy = sext <4 x i16> %y to <4 x i32>
1714  %m = mul <4 x i32> %xx, %yy
1715  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1716  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1717  %r = add i32 %z, %a
1718  ret i32 %r
1719}
1720
; Same-width variant: the multiply, the reduction and the accumulate with %a
; are all done in i16, with no extension of the inputs. Lanes where %b != 0
; are zeroed by the select. The function returns a zeroext i16; the expected
; output is vpt.i16 / vmlavat.u16 followed by uxth on the result.
1721define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i16 %a) {
1722; CHECK-LABEL: add_v8i16_v8i16_acc:
1723; CHECK:       @ %bb.0: @ %entry
1724; CHECK-NEXT:    vpt.i16 eq, q2, zr
1725; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
1726; CHECK-NEXT:    uxth r0, r0
1727; CHECK-NEXT:    bx lr
1728entry:
1729  %c = icmp eq <8 x i16> %b, zeroinitializer
1730  %m = mul <8 x i16> %x, %y
1731  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
1732  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
1733  %r = add i16 %z, %a
1734  ret i16 %r
1735}
1736
; 64-bit accumulating variant: %x and %y are zero-extended straight from i16
; to i64, multiplied, masked to the lanes where %b == 0, summed, and added to
; the i64 %a. The expected output is a VPT block holding a single
; vmlalvat.u16 that accumulates into the r0/r1 pair.
1737define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1738; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
1739; CHECK:       @ %bb.0: @ %entry
1740; CHECK-NEXT:    vpt.i16 eq, q2, zr
1741; CHECK-NEXT:    vmlalvat.u16 r0, r1, q0, q1
1742; CHECK-NEXT:    bx lr
1743entry:
1744  %c = icmp eq <8 x i16> %b, zeroinitializer
1745  %xx = zext <8 x i16> %x to <8 x i64>
1746  %yy = zext <8 x i16> %y to <8 x i64>
1747  %m = mul <8 x i64> %xx, %yy
1748  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1749  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1750  %r = add i64 %z, %a
1751  ret i64 %r
1752}
1753
; Signed 64-bit accumulating variant: %x and %y are sign-extended straight
; from i16 to i64, multiplied, masked to the lanes where %b == 0, summed, and
; added to the i64 %a. The expected output is a VPT block holding a single
; vmlalvat.s16 that accumulates into the r0/r1 pair.
1754define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1755; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
1756; CHECK:       @ %bb.0: @ %entry
1757; CHECK-NEXT:    vpt.i16 eq, q2, zr
1758; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q1
1759; CHECK-NEXT:    bx lr
1760entry:
1761  %c = icmp eq <8 x i16> %b, zeroinitializer
1762  %xx = sext <8 x i16> %x to <8 x i64>
1763  %yy = sext <8 x i16> %y to <8 x i64>
1764  %m = mul <8 x i64> %xx, %yy
1765  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1766  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1767  %r = add i64 %z, %a
1768  ret i64 %r
1769}
1770
; Two-step extension variant: %x and %y are zero-extended from i16 to i32 and
; multiplied in i32, then the product is zero-extended to i64 before the
; predicated reduction and the add of %a. The expected output is the same
; vpt.i16 / vmlalvat.u16 pair as for the direct i16 -> i64 form.
1771define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1772; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
1773; CHECK:       @ %bb.0: @ %entry
1774; CHECK-NEXT:    vpt.i16 eq, q2, zr
1775; CHECK-NEXT:    vmlalvat.u16 r0, r1, q0, q1
1776; CHECK-NEXT:    bx lr
1777entry:
1778  %c = icmp eq <8 x i16> %b, zeroinitializer
1779  %xx = zext <8 x i16> %x to <8 x i32>
1780  %yy = zext <8 x i16> %y to <8 x i32>
1781  %m = mul <8 x i32> %xx, %yy
1782  %ma = zext <8 x i32> %m to <8 x i64>
1783  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1784  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1785  %r = add i64 %z, %a
1786  ret i64 %r
1787}
1788
; Two-step signed extension variant: %x and %y are sign-extended from i16 to
; i32 and multiplied in i32, then the product is sign-extended to i64 before
; the predicated reduction and the add of %a. The expected output is the same
; vpt.i16 / vmlalvat.s16 pair as for the direct i16 -> i64 form.
1789define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1790; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
1791; CHECK:       @ %bb.0: @ %entry
1792; CHECK-NEXT:    vpt.i16 eq, q2, zr
1793; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q1
1794; CHECK-NEXT:    bx lr
1795entry:
1796  %c = icmp eq <8 x i16> %b, zeroinitializer
1797  %xx = sext <8 x i16> %x to <8 x i32>
1798  %yy = sext <8 x i16> %y to <8 x i32>
1799  %m = mul <8 x i32> %xx, %yy
1800  %ma = sext <8 x i32> %m to <8 x i64>
1801  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1802  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1803  %r = add i64 %z, %a
1804  ret i64 %r
1805}
1806
; Mixed-extension variant: %x is sign-extended to i32 and multiplied by
; itself (%y is unused), and the i32 square is then zero-extended to i64
; before the predicated reduction and the add of %a.
; NOTE(review): presumably this is matched as a signed multiply because the
; square of a sign-extended i16 is never negative in i32, making the zext
; equivalent to a sext -- confirm against the ARM lowering code.
; The expected output is vmlalvat.s16 with q0 used for both multiplicands.
1807define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1808; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
1809; CHECK:       @ %bb.0: @ %entry
1810; CHECK-NEXT:    vpt.i16 eq, q2, zr
1811; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q0
1812; CHECK-NEXT:    bx lr
1813entry:
1814  %c = icmp eq <8 x i16> %b, zeroinitializer
1815  %xx = sext <8 x i16> %x to <8 x i32>
1816  %m = mul <8 x i32> %xx, %xx
1817  %ma = zext <8 x i32> %m to <8 x i64>
1818  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1819  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1820  %r = add i64 %z, %a
1821  ret i64 %r
1822}
1823
; Two-lane variant taking <2 x i16> operands: zero-extend %x and %y to i64,
; multiply, keep lanes where %b == 0, sum, and add the i64 %a.
; The expected output does not use a vector multiply-accumulate here: the
; inputs are masked with 0xffff, each lane is multiplied with a scalar umull,
; the per-lane predicate is rebuilt with cmp/cset/csetm and applied with vand,
; and the two 64-bit lanes plus %a are summed with adds/adc pairs.
1824define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
1825; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
1826; CHECK:       @ %bb.0: @ %entry
1827; CHECK-NEXT:    .save {r7, lr}
1828; CHECK-NEXT:    push {r7, lr}
1829; CHECK-NEXT:    .vsave {d8, d9}
1830; CHECK-NEXT:    vpush {d8, d9}
1831; CHECK-NEXT:    vmov.i64 q3, #0xffff
1832; CHECK-NEXT:    vand q1, q1, q3
1833; CHECK-NEXT:    vand q4, q0, q3
1834; CHECK-NEXT:    vmov r2, s4
1835; CHECK-NEXT:    vmov r3, s16
1836; CHECK-NEXT:    umull r2, r3, r3, r2
1837; CHECK-NEXT:    vmov.32 q0[0], r2
1838; CHECK-NEXT:    vmov r2, s6
1839; CHECK-NEXT:    vmov.32 q0[1], r3
1840; CHECK-NEXT:    vmov r3, s18
1841; CHECK-NEXT:    vand q1, q2, q3
1842; CHECK-NEXT:    umull r2, r3, r3, r2
1843; CHECK-NEXT:    vmov.32 q0[2], r2
1844; CHECK-NEXT:    vmov r2, s4
1845; CHECK-NEXT:    vmov.32 q0[3], r3
1846; CHECK-NEXT:    cmp r2, #0
1847; CHECK-NEXT:    cset r2, eq
1848; CHECK-NEXT:    tst.w r2, #1
1849; CHECK-NEXT:    csetm r2, ne
1850; CHECK-NEXT:    vmov.32 q2[0], r2
1851; CHECK-NEXT:    vmov.32 q2[1], r2
1852; CHECK-NEXT:    vmov r2, s6
1853; CHECK-NEXT:    cmp r2, #0
1854; CHECK-NEXT:    cset r2, eq
1855; CHECK-NEXT:    tst.w r2, #1
1856; CHECK-NEXT:    csetm r2, ne
1857; CHECK-NEXT:    vmov.32 q2[2], r2
1858; CHECK-NEXT:    vmov.32 q2[3], r2
1859; CHECK-NEXT:    vand q0, q0, q2
1860; CHECK-NEXT:    vmov r2, s2
1861; CHECK-NEXT:    vmov r3, s0
1862; CHECK-NEXT:    vmov r12, s3
1863; CHECK-NEXT:    vmov lr, s1
1864; CHECK-NEXT:    adds r2, r2, r3
1865; CHECK-NEXT:    adc.w r3, lr, r12
1866; CHECK-NEXT:    adds r0, r0, r2
1867; CHECK-NEXT:    adcs r1, r3
1868; CHECK-NEXT:    vpop {d8, d9}
1869; CHECK-NEXT:    pop {r7, pc}
1870entry:
1871  %c = icmp eq <2 x i16> %b, zeroinitializer
1872  %xx = zext <2 x i16> %x to <2 x i64>
1873  %yy = zext <2 x i16> %y to <2 x i64>
1874  %m = mul <2 x i64> %xx, %yy
1875  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1876  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1877  %r = add i64 %z, %a
1878  ret i64 %r
1879}
1880
; Two-lane signed variant taking <2 x i16> operands: sign-extend %x and %y to
; i64, multiply, keep lanes where %b == 0, sum, and add the i64 %a.
; The expected output does not use a vector multiply-accumulate here: each
; lane is sign-extended with sxth and multiplied with a scalar smull, the
; predicate operand is masked with 0xffff and turned into a lane mask with
; cmp/cset/csetm, the mask is applied with vand, and the two 64-bit lanes
; plus %a are summed with adds/adc pairs.
1881define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
1882; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
1883; CHECK:       @ %bb.0: @ %entry
1884; CHECK-NEXT:    .save {r7, lr}
1885; CHECK-NEXT:    push {r7, lr}
1886; CHECK-NEXT:    vmov.i32 q3, #0xffff
1887; CHECK-NEXT:    vmov r3, s0
1888; CHECK-NEXT:    vand q3, q2, q3
1889; CHECK-NEXT:    vmov r2, s12
1890; CHECK-NEXT:    sxth r3, r3
1891; CHECK-NEXT:    cmp r2, #0
1892; CHECK-NEXT:    cset r2, eq
1893; CHECK-NEXT:    tst.w r2, #1
1894; CHECK-NEXT:    csetm r2, ne
1895; CHECK-NEXT:    vmov.32 q2[0], r2
1896; CHECK-NEXT:    vmov.32 q2[1], r2
1897; CHECK-NEXT:    vmov r2, s14
1898; CHECK-NEXT:    cmp r2, #0
1899; CHECK-NEXT:    cset r2, eq
1900; CHECK-NEXT:    tst.w r2, #1
1901; CHECK-NEXT:    csetm r2, ne
1902; CHECK-NEXT:    vmov.32 q2[2], r2
1903; CHECK-NEXT:    vmov.32 q2[3], r2
1904; CHECK-NEXT:    vmov r2, s4
1905; CHECK-NEXT:    sxth r2, r2
1906; CHECK-NEXT:    smull r2, r3, r3, r2
1907; CHECK-NEXT:    vmov.32 q3[0], r2
1908; CHECK-NEXT:    vmov r2, s6
1909; CHECK-NEXT:    vmov.32 q3[1], r3
1910; CHECK-NEXT:    vmov r3, s2
1911; CHECK-NEXT:    sxth r2, r2
1912; CHECK-NEXT:    sxth r3, r3
1913; CHECK-NEXT:    smull r2, r3, r3, r2
1914; CHECK-NEXT:    vmov.32 q3[2], r2
1915; CHECK-NEXT:    vmov.32 q3[3], r3
1916; CHECK-NEXT:    vand q0, q3, q2
1917; CHECK-NEXT:    vmov r2, s2
1918; CHECK-NEXT:    vmov r3, s0
1919; CHECK-NEXT:    vmov r12, s3
1920; CHECK-NEXT:    vmov lr, s1
1921; CHECK-NEXT:    adds r2, r2, r3
1922; CHECK-NEXT:    adc.w r3, lr, r12
1923; CHECK-NEXT:    adds r0, r0, r2
1924; CHECK-NEXT:    adcs r1, r3
1925; CHECK-NEXT:    pop {r7, pc}
1926entry:
1927  %c = icmp eq <2 x i16> %b, zeroinitializer
1928  %xx = sext <2 x i16> %x to <2 x i64>
1929  %yy = sext <2 x i16> %y to <2 x i64>
1930  %m = mul <2 x i64> %xx, %yy
1931  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1932  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1933  %r = add i64 %z, %a
1934  ret i64 %r
1935}
1936
; Sixteen-lane byte variant with an i32 accumulator: %x and %y are
; zero-extended from i8 to i32 and multiplied; lanes where %b != 0 are zeroed
; by the select, the rest are summed and added to %a. The expected output is
; a VPT block holding a single vmlavat.u8.
1937define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
1938; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
1939; CHECK:       @ %bb.0: @ %entry
1940; CHECK-NEXT:    vpt.i8 eq, q2, zr
1941; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
1942; CHECK-NEXT:    bx lr
1943entry:
1944  %c = icmp eq <16 x i8> %b, zeroinitializer
1945  %xx = zext <16 x i8> %x to <16 x i32>
1946  %yy = zext <16 x i8> %y to <16 x i32>
1947  %m = mul <16 x i32> %xx, %yy
1948  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
1949  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
1950  %r = add i32 %z, %a
1951  ret i32 %r
1952}
1953
; Signed sixteen-lane byte variant with an i32 accumulator: %x and %y are
; sign-extended from i8 to i32 and multiplied; lanes where %b != 0 are zeroed
; by the select, the rest are summed and added to %a. The expected output is
; a VPT block holding a single vmlavat.s8.
1954define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
1955; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
1956; CHECK:       @ %bb.0: @ %entry
1957; CHECK-NEXT:    vpt.i8 eq, q2, zr
1958; CHECK-NEXT:    vmlavat.s8 r0, q0, q1
1959; CHECK-NEXT:    bx lr
1960entry:
1961  %c = icmp eq <16 x i8> %b, zeroinitializer
1962  %xx = sext <16 x i8> %x to <16 x i32>
1963  %yy = sext <16 x i8> %y to <16 x i32>
1964  %m = mul <16 x i32> %xx, %yy
1965  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
1966  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
1967  %r = add i32 %z, %a
1968  ret i32 %r
1969}
1970
; Two-step extension byte variant: %x and %y are zero-extended from i8 to i16
; and multiplied in i16, then the product is zero-extended to i32 before the
; predicated reduction and the add of %a. The expected output is the same
; vpt.i8 / vmlavat.u8 pair as for the direct i8 -> i32 form.
1971define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
1972; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext:
1973; CHECK:       @ %bb.0: @ %entry
1974; CHECK-NEXT:    vpt.i8 eq, q2, zr
1975; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
1976; CHECK-NEXT:    bx lr
1977entry:
1978  %c = icmp eq <16 x i8> %b, zeroinitializer
1979  %xx = zext <16 x i8> %x to <16 x i16>
1980  %yy = zext <16 x i8> %y to <16 x i16>
1981  %m = mul <16 x i16> %xx, %yy
1982  %ma = zext <16 x i16> %m to <16 x i32>
1983  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
1984  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
1985  %r = add i32 %z, %a
1986  ret i32 %r
1987}
1988
; Two-step signed extension byte variant: %x and %y are sign-extended from i8
; to i16 and multiplied in i16, then the product is sign-extended to i32
; before the predicated reduction and the add of %a. The expected output is
; the same vpt.i8 / vmlavat.s8 pair as for the direct i8 -> i32 form.
1989define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
1990; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext:
1991; CHECK:       @ %bb.0: @ %entry
1992; CHECK-NEXT:    vpt.i8 eq, q2, zr
1993; CHECK-NEXT:    vmlavat.s8 r0, q0, q1
1994; CHECK-NEXT:    bx lr
1995entry:
1996  %c = icmp eq <16 x i8> %b, zeroinitializer
1997  %xx = sext <16 x i8> %x to <16 x i16>
1998  %yy = sext <16 x i8> %y to <16 x i16>
1999  %m = mul <16 x i16> %xx, %yy
2000  %ma = sext <16 x i16> %m to <16 x i32>
2001  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2002  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2003  %r = add i32 %z, %a
2004  ret i32 %r
2005}
2006
; Mixed-extension byte variant: %x is sign-extended to i16 and multiplied by
; itself (%y is unused), and the i16 square is then zero-extended to i32
; before the predicated reduction and the add of %a.
; NOTE(review): presumably this is matched as a signed multiply because the
; square of a sign-extended i8 is never negative in i16, making the zext
; equivalent to a sext -- confirm against the ARM lowering code.
; The expected output is vmlavat.s8 with q0 used for both multiplicands.
2007define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2008; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext:
2009; CHECK:       @ %bb.0: @ %entry
2010; CHECK-NEXT:    vpt.i8 eq, q2, zr
2011; CHECK-NEXT:    vmlavat.s8 r0, q0, q0
2012; CHECK-NEXT:    bx lr
2013entry:
2014  %c = icmp eq <16 x i8> %b, zeroinitializer
2015  %xx = sext <16 x i8> %x to <16 x i16>
2016  %m = mul <16 x i16> %xx, %xx
2017  %ma = zext <16 x i16> %m to <16 x i32>
2018  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2019  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2020  %r = add i32 %z, %a
2021  ret i32 %r
2022}
2023
; Four-lane byte variant taking <4 x i8> operands: zero-extend %x and %y to
; i32, multiply, keep lanes where %b == 0, sum, and add %a. The expected
; output masks all three vector operands with 0xff (vmov.i32 + vand) and then
; uses a vpt.i32 / vmlavat.u32 pair.
2024define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
2025; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
2026; CHECK:       @ %bb.0: @ %entry
2027; CHECK-NEXT:    vmov.i32 q3, #0xff
2028; CHECK-NEXT:    vand q1, q1, q3
2029; CHECK-NEXT:    vand q0, q0, q3
2030; CHECK-NEXT:    vand q2, q2, q3
2031; CHECK-NEXT:    vpt.i32 eq, q2, zr
2032; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
2033; CHECK-NEXT:    bx lr
2034entry:
2035  %c = icmp eq <4 x i8> %b, zeroinitializer
2036  %xx = zext <4 x i8> %x to <4 x i32>
2037  %yy = zext <4 x i8> %y to <4 x i32>
2038  %m = mul <4 x i32> %xx, %yy
2039  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
2040  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
2041  %r = add i32 %z, %a
2042  ret i32 %r
2043}
2044
; Four-lane signed byte variant taking <4 x i8> operands: sign-extend %x and
; %y to i32, multiply, keep lanes where %b == 0, sum, and add %a. In the
; expected output the multiplicands are widened in two steps (vmovlb.s8 then
; vmovlb.s16), the predicate operand is masked with 0xff, and the reduction
; is a vpt.i32 / vmlavat.u32 pair.
2045define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
2046; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
2047; CHECK:       @ %bb.0: @ %entry
2048; CHECK-NEXT:    vmovlb.s8 q1, q1
2049; CHECK-NEXT:    vmovlb.s8 q0, q0
2050; CHECK-NEXT:    vmov.i32 q3, #0xff
2051; CHECK-NEXT:    vmovlb.s16 q1, q1
2052; CHECK-NEXT:    vand q2, q2, q3
2053; CHECK-NEXT:    vmovlb.s16 q0, q0
2054; CHECK-NEXT:    vpt.i32 eq, q2, zr
2055; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
2056; CHECK-NEXT:    bx lr
2057entry:
2058  %c = icmp eq <4 x i8> %b, zeroinitializer
2059  %xx = sext <4 x i8> %x to <4 x i32>
2060  %yy = sext <4 x i8> %y to <4 x i32>
2061  %m = mul <4 x i32> %xx, %yy
2062  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
2063  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
2064  %r = add i32 %z, %a
2065  ret i32 %r
2066}
2067
; Sixteen-lane byte variant with an i16 accumulator: %x and %y are
; zero-extended from i8 to i16 and multiplied; lanes where %b != 0 are zeroed,
; the rest are reduced in i16 and added to the i16 %a. The function returns a
; zeroext i16; the expected output is vpt.i8 / vmlavat.u8 followed by uxth.
2068define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
2069; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
2070; CHECK:       @ %bb.0: @ %entry
2071; CHECK-NEXT:    vpt.i8 eq, q2, zr
2072; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
2073; CHECK-NEXT:    uxth r0, r0
2074; CHECK-NEXT:    bx lr
2075entry:
2076  %c = icmp eq <16 x i8> %b, zeroinitializer
2077  %xx = zext <16 x i8> %x to <16 x i16>
2078  %yy = zext <16 x i8> %y to <16 x i16>
2079  %m = mul <16 x i16> %xx, %yy
2080  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
2081  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
2082  %r = add i16 %z, %a
2083  ret i16 %r
2084}
2085
; Signed sixteen-lane byte variant with an i16 accumulator: %x and %y are
; sign-extended from i8 to i16 and multiplied; lanes where %b != 0 are zeroed,
; the rest are reduced in i16 and added to the i16 %a. The function returns a
; signext i16; the expected output is vpt.i8 / vmlavat.s8 followed by sxth.
2086define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
2087; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
2088; CHECK:       @ %bb.0: @ %entry
2089; CHECK-NEXT:    vpt.i8 eq, q2, zr
2090; CHECK-NEXT:    vmlavat.s8 r0, q0, q1
2091; CHECK-NEXT:    sxth r0, r0
2092; CHECK-NEXT:    bx lr
2093entry:
2094  %c = icmp eq <16 x i8> %b, zeroinitializer
2095  %xx = sext <16 x i8> %x to <16 x i16>
2096  %yy = sext <16 x i8> %y to <16 x i16>
2097  %m = mul <16 x i16> %xx, %yy
2098  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
2099  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
2100  %r = add i16 %z, %a
2101  ret i16 %r
2102}
2103
; Eight-lane byte variant taking <8 x i8> operands with an i16 accumulator:
; zero-extend %x and %y to i16, multiply, keep lanes where %b == 0, reduce in
; i16, and add the i16 %a. The expected output widens all three vector
; operands with vmovlb.u8, uses a vpt.i16 / vmlavat.u16 pair, and finishes
; with uxth for the zeroext i16 return.
2104define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
2105; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
2106; CHECK:       @ %bb.0: @ %entry
2107; CHECK-NEXT:    vmovlb.u8 q1, q1
2108; CHECK-NEXT:    vmovlb.u8 q0, q0
2109; CHECK-NEXT:    vmovlb.u8 q2, q2
2110; CHECK-NEXT:    vpt.i16 eq, q2, zr
2111; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
2112; CHECK-NEXT:    uxth r0, r0
2113; CHECK-NEXT:    bx lr
2114entry:
2115  %c = icmp eq <8 x i8> %b, zeroinitializer
2116  %xx = zext <8 x i8> %x to <8 x i16>
2117  %yy = zext <8 x i8> %y to <8 x i16>
2118  %m = mul <8 x i16> %xx, %yy
2119  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
2120  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
2121  %r = add i16 %z, %a
2122  ret i16 %r
2123}
2124
; Eight-lane signed byte variant taking <8 x i8> operands with an i16
; accumulator: sign-extend %x and %y to i16, multiply, keep lanes where
; %b == 0, reduce in i16, and add the i16 %a. In the expected output the
; multiplicands are widened with vmovlb.s8 and the predicate operand with
; vmovlb.u8; the reduction is vpt.i16 / vmlavat.u16, followed by sxth for the
; signext i16 return.
2125define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
2126; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
2127; CHECK:       @ %bb.0: @ %entry
2128; CHECK-NEXT:    vmovlb.s8 q1, q1
2129; CHECK-NEXT:    vmovlb.s8 q0, q0
2130; CHECK-NEXT:    vmovlb.u8 q2, q2
2131; CHECK-NEXT:    vpt.i16 eq, q2, zr
2132; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
2133; CHECK-NEXT:    sxth r0, r0
2134; CHECK-NEXT:    bx lr
2135entry:
2136  %c = icmp eq <8 x i8> %b, zeroinitializer
2137  %xx = sext <8 x i8> %x to <8 x i16>
2138  %yy = sext <8 x i8> %y to <8 x i16>
2139  %m = mul <8 x i16> %xx, %yy
2140  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
2141  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
2142  %r = add i16 %z, %a
2143  ret i16 %r
2144}
2145
; Same-width byte variant: the multiply, the reduction and the accumulate with
; %a are all done in i8, with no extension of the inputs. Lanes where %b != 0
; are zeroed by the select. The function returns a zeroext i8; the expected
; output is vpt.i8 / vmlavat.u8 followed by uxtb on the result.
2146define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i8 %a) {
2147; CHECK-LABEL: add_v16i8_v16i8_acc:
2148; CHECK:       @ %bb.0: @ %entry
2149; CHECK-NEXT:    vpt.i8 eq, q2, zr
2150; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
2151; CHECK-NEXT:    uxtb r0, r0
2152; CHECK-NEXT:    bx lr
2153entry:
2154  %c = icmp eq <16 x i8> %b, zeroinitializer
2155  %m = mul <16 x i8> %x, %y
2156  %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
2157  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
2158  %r = add i8 %z, %a
2159  ret i8 %r
2160}
2161
2162define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
2163; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
2164; CHECK:       @ %bb.0: @ %entry
2165; CHECK-NEXT:    .save {r4, r5, r6, lr}
2166; CHECK-NEXT:    push {r4, r5, r6, lr}
2167; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2168; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2169; CHECK-NEXT:    .pad #80
2170; CHECK-NEXT:    sub sp, #80
2171; CHECK-NEXT:    vmov q3, q1
2172; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
2173; CHECK-NEXT:    vmov.i8 q0, #0x0
2174; CHECK-NEXT:    vmov.i8 q1, #0xff
2175; CHECK-NEXT:    vcmp.i8 eq, q2, zr
2176; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
2177; CHECK-NEXT:    vpsel q5, q1, q0
2178; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
2179; CHECK-NEXT:    vmov.u8 r2, q5[0]
2180; CHECK-NEXT:    vmov.i64 q4, #0xff
2181; CHECK-NEXT:    vmov.16 q2[0], r2
2182; CHECK-NEXT:    vmov.u8 r2, q5[1]
2183; CHECK-NEXT:    vmov.16 q2[1], r2
2184; CHECK-NEXT:    vmov.u8 r2, q5[2]
2185; CHECK-NEXT:    vmov.16 q2[2], r2
2186; CHECK-NEXT:    vmov.u8 r2, q5[3]
2187; CHECK-NEXT:    vmov.16 q2[3], r2
2188; CHECK-NEXT:    vmov.u8 r2, q5[4]
2189; CHECK-NEXT:    vmov.16 q2[4], r2
2190; CHECK-NEXT:    vmov.u8 r2, q5[5]
2191; CHECK-NEXT:    vmov.16 q2[5], r2
2192; CHECK-NEXT:    vmov.u8 r2, q5[6]
2193; CHECK-NEXT:    vmov.16 q2[6], r2
2194; CHECK-NEXT:    vmov.u8 r2, q5[7]
2195; CHECK-NEXT:    vmov.16 q2[7], r2
2196; CHECK-NEXT:    vcmp.i16 ne, q2, zr
2197; CHECK-NEXT:    vpsel q6, q1, q0
2198; CHECK-NEXT:    vmov.u16 r2, q6[0]
2199; CHECK-NEXT:    vmov.32 q2[0], r2
2200; CHECK-NEXT:    vmov.u16 r2, q6[1]
2201; CHECK-NEXT:    vmov.32 q2[1], r2
2202; CHECK-NEXT:    vmov.u16 r2, q6[2]
2203; CHECK-NEXT:    vmov.32 q2[2], r2
2204; CHECK-NEXT:    vmov.u16 r2, q6[3]
2205; CHECK-NEXT:    vmov.32 q2[3], r2
2206; CHECK-NEXT:    vcmp.i32 ne, q2, zr
2207; CHECK-NEXT:    vmrs lr, p0
2208; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
2209; CHECK-NEXT:    and r3, lr, #1
2210; CHECK-NEXT:    rsbs r3, r3, #0
2211; CHECK-NEXT:    vmov.32 q7[0], r3
2212; CHECK-NEXT:    vmov.32 q7[1], r3
2213; CHECK-NEXT:    ubfx r3, lr, #4, #1
2214; CHECK-NEXT:    rsbs r3, r3, #0
2215; CHECK-NEXT:    vmov.32 q7[2], r3
2216; CHECK-NEXT:    vmov.32 q7[3], r3
2217; CHECK-NEXT:    vmov.u8 r3, q3[0]
2218; CHECK-NEXT:    vmov.32 q0[0], r3
2219; CHECK-NEXT:    vmov.u8 r3, q3[1]
2220; CHECK-NEXT:    vmov.32 q0[2], r3
2221; CHECK-NEXT:    vmov.u8 r3, q1[0]
2222; CHECK-NEXT:    vand q2, q0, q4
2223; CHECK-NEXT:    vmov.32 q0[0], r3
2224; CHECK-NEXT:    vmov.u8 r3, q1[1]
2225; CHECK-NEXT:    vmov r12, s8
2226; CHECK-NEXT:    vmov.32 q0[2], r3
2227; CHECK-NEXT:    vand q1, q0, q4
2228; CHECK-NEXT:    vmov r3, s4
2229; CHECK-NEXT:    umull r3, r2, r3, r12
2230; CHECK-NEXT:    vmov.32 q0[0], r3
2231; CHECK-NEXT:    vmov r3, s6
2232; CHECK-NEXT:    vmov.32 q0[1], r2
2233; CHECK-NEXT:    vmov r2, s10
2234; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
2235; CHECK-NEXT:    umull r2, r3, r3, r2
2236; CHECK-NEXT:    vmov.32 q0[2], r2
2237; CHECK-NEXT:    vmov.32 q0[3], r3
2238; CHECK-NEXT:    vand q0, q0, q7
2239; CHECK-NEXT:    vmov r4, s2
2240; CHECK-NEXT:    vmov r2, s0
2241; CHECK-NEXT:    vmov r12, s3
2242; CHECK-NEXT:    vmov r3, s1
2243; CHECK-NEXT:    adds r4, r4, r2
2244; CHECK-NEXT:    ubfx r2, lr, #8, #1
2245; CHECK-NEXT:    rsb.w r2, r2, #0
2246; CHECK-NEXT:    vmov.32 q7[0], r2
2247; CHECK-NEXT:    adc.w r12, r12, r3
2248; CHECK-NEXT:    vmov.32 q7[1], r2
2249; CHECK-NEXT:    ubfx r2, lr, #12, #1
2250; CHECK-NEXT:    rsbs r2, r2, #0
2251; CHECK-NEXT:    vmov.u8 r3, q2[2]
2252; CHECK-NEXT:    vmov.32 q7[2], r2
2253; CHECK-NEXT:    vmov.32 q1[0], r3
2254; CHECK-NEXT:    vmov.32 q7[3], r2
2255; CHECK-NEXT:    vmov.u8 r2, q3[2]
2256; CHECK-NEXT:    vmov.32 q0[0], r2
2257; CHECK-NEXT:    vmov.u8 r2, q3[3]
2258; CHECK-NEXT:    vmov.u8 r3, q2[3]
2259; CHECK-NEXT:    vmov.32 q0[2], r2
2260; CHECK-NEXT:    vmov.32 q1[2], r3
2261; CHECK-NEXT:    vand q0, q0, q4
2262; CHECK-NEXT:    vand q1, q1, q4
2263; CHECK-NEXT:    vmov r2, s0
2264; CHECK-NEXT:    vmov r3, s4
2265; CHECK-NEXT:    umull r2, r3, r3, r2
2266; CHECK-NEXT:    vmov.32 q2[0], r2
2267; CHECK-NEXT:    vmov r2, s2
2268; CHECK-NEXT:    vmov.32 q2[1], r3
2269; CHECK-NEXT:    vmov r3, s6
2270; CHECK-NEXT:    umull r2, r3, r3, r2
2271; CHECK-NEXT:    vmov.32 q2[2], r2
2272; CHECK-NEXT:    vmov.32 q2[3], r3
2273; CHECK-NEXT:    vand q0, q2, q7
2274; CHECK-NEXT:    vmov r3, s0
2275; CHECK-NEXT:    vmov r2, s1
2276; CHECK-NEXT:    adds r3, r3, r4
2277; CHECK-NEXT:    vmov r4, s3
2278; CHECK-NEXT:    adc.w lr, r12, r2
2279; CHECK-NEXT:    vmov r2, s2
2280; CHECK-NEXT:    adds.w r12, r3, r2
2281; CHECK-NEXT:    vmov.u16 r2, q6[4]
2282; CHECK-NEXT:    vmov.32 q0[0], r2
2283; CHECK-NEXT:    vmov.u16 r2, q6[5]
2284; CHECK-NEXT:    vmov.32 q0[1], r2
2285; CHECK-NEXT:    vmov.u16 r2, q6[6]
2286; CHECK-NEXT:    vmov.32 q0[2], r2
2287; CHECK-NEXT:    vmov.u16 r2, q6[7]
2288; CHECK-NEXT:    vmov.32 q0[3], r2
2289; CHECK-NEXT:    adc.w lr, lr, r4
2290; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2291; CHECK-NEXT:    vmrs r6, p0
2292; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
2293; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
2294; CHECK-NEXT:    and r4, r6, #1
2295; CHECK-NEXT:    rsbs r4, r4, #0
2296; CHECK-NEXT:    vmov.32 q6[0], r4
2297; CHECK-NEXT:    vmov.32 q6[1], r4
2298; CHECK-NEXT:    ubfx r4, r6, #4, #1
2299; CHECK-NEXT:    rsbs r4, r4, #0
2300; CHECK-NEXT:    vmov.32 q6[2], r4
2301; CHECK-NEXT:    vmov.32 q6[3], r4
2302; CHECK-NEXT:    vmov.u8 r4, q3[4]
2303; CHECK-NEXT:    vmov.32 q0[0], r4
2304; CHECK-NEXT:    vmov.u8 r4, q3[5]
2305; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
2306; CHECK-NEXT:    vmov.32 q0[2], r4
2307; CHECK-NEXT:    vand q0, q0, q4
2308; CHECK-NEXT:    vmov.u8 r3, q3[4]
2309; CHECK-NEXT:    vmov r4, s0
2310; CHECK-NEXT:    vmov.32 q1[0], r3
2311; CHECK-NEXT:    vmov.u8 r3, q3[5]
2312; CHECK-NEXT:    vmov.32 q1[2], r3
2313; CHECK-NEXT:    vmov q7, q3
2314; CHECK-NEXT:    vand q1, q1, q4
2315; CHECK-NEXT:    vmov r3, s4
2316; CHECK-NEXT:    umull r3, r4, r3, r4
2317; CHECK-NEXT:    vmov.32 q2[0], r3
2318; CHECK-NEXT:    vmov r3, s2
2319; CHECK-NEXT:    vmov.32 q2[1], r4
2320; CHECK-NEXT:    vmov r4, s6
2321; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
2322; CHECK-NEXT:    umull r3, r4, r4, r3
2323; CHECK-NEXT:    vmov.32 q2[2], r3
2324; CHECK-NEXT:    vmov.32 q2[3], r4
2325; CHECK-NEXT:    vand q0, q2, q6
2326; CHECK-NEXT:    vmov r4, s0
2327; CHECK-NEXT:    vmov r3, s1
2328; CHECK-NEXT:    vmov r5, s2
2329; CHECK-NEXT:    vmov r2, s3
2330; CHECK-NEXT:    adds.w r4, r4, r12
2331; CHECK-NEXT:    adc.w r3, r3, lr
2332; CHECK-NEXT:    adds r4, r4, r5
2333; CHECK-NEXT:    adc.w r12, r3, r2
2334; CHECK-NEXT:    ubfx r2, r6, #8, #1
2335; CHECK-NEXT:    rsbs r2, r2, #0
2336; CHECK-NEXT:    vmov.u8 r3, q3[6]
2337; CHECK-NEXT:    vmov.32 q6[0], r2
2338; CHECK-NEXT:    vmov.32 q6[1], r2
2339; CHECK-NEXT:    ubfx r2, r6, #12, #1
2340; CHECK-NEXT:    rsbs r2, r2, #0
2341; CHECK-NEXT:    vmov.32 q6[2], r2
2342; CHECK-NEXT:    vmov.32 q6[3], r2
2343; CHECK-NEXT:    vmov.u8 r2, q1[6]
2344; CHECK-NEXT:    vmov.32 q0[0], r2
2345; CHECK-NEXT:    vmov.u8 r2, q1[7]
2346; CHECK-NEXT:    vmov.32 q1[0], r3
2347; CHECK-NEXT:    vmov.u8 r3, q3[7]
2348; CHECK-NEXT:    vmov.32 q0[2], r2
2349; CHECK-NEXT:    vmov.32 q1[2], r3
2350; CHECK-NEXT:    vand q0, q0, q4
2351; CHECK-NEXT:    vand q1, q1, q4
2352; CHECK-NEXT:    vmov r2, s0
2353; CHECK-NEXT:    vmov r3, s4
2354; CHECK-NEXT:    umull r2, r3, r3, r2
2355; CHECK-NEXT:    vmov.32 q2[0], r2
2356; CHECK-NEXT:    vmov r2, s2
2357; CHECK-NEXT:    vmov.32 q2[1], r3
2358; CHECK-NEXT:    vmov r3, s6
2359; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
2360; CHECK-NEXT:    umull r2, r3, r3, r2
2361; CHECK-NEXT:    vmov.32 q2[2], r2
2362; CHECK-NEXT:    vmov.32 q2[3], r3
2363; CHECK-NEXT:    vand q0, q2, q6
2364; CHECK-NEXT:    vmov r3, s0
2365; CHECK-NEXT:    vmov r2, s1
2366; CHECK-NEXT:    vmov r5, s2
2367; CHECK-NEXT:    vmov r6, s3
2368; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
2369; CHECK-NEXT:    adds r3, r3, r4
2370; CHECK-NEXT:    adc.w r2, r2, r12
2371; CHECK-NEXT:    adds.w r12, r3, r5
2372; CHECK-NEXT:    vmov.u8 r5, q7[8]
2373; CHECK-NEXT:    adc.w r3, r2, r6
2374; CHECK-NEXT:    vmov.u8 r2, q5[8]
2375; CHECK-NEXT:    vmov.16 q6[0], r2
2376; CHECK-NEXT:    vmov.u8 r2, q5[9]
2377; CHECK-NEXT:    vmov.16 q6[1], r2
2378; CHECK-NEXT:    vmov.u8 r2, q5[10]
2379; CHECK-NEXT:    vmov.16 q6[2], r2
2380; CHECK-NEXT:    vmov.u8 r2, q5[11]
2381; CHECK-NEXT:    vmov.16 q6[3], r2
2382; CHECK-NEXT:    vmov.u8 r2, q5[12]
2383; CHECK-NEXT:    vmov.16 q6[4], r2
2384; CHECK-NEXT:    vmov.u8 r2, q5[13]
2385; CHECK-NEXT:    vmov.16 q6[5], r2
2386; CHECK-NEXT:    vmov.u8 r2, q5[14]
2387; CHECK-NEXT:    vmov.16 q6[6], r2
2388; CHECK-NEXT:    vmov.u8 r2, q5[15]
2389; CHECK-NEXT:    vmov.16 q6[7], r2
2390; CHECK-NEXT:    vcmp.i16 ne, q6, zr
2391; CHECK-NEXT:    vpsel q3, q1, q0
2392; CHECK-NEXT:    vmov.32 q1[0], r5
2393; CHECK-NEXT:    vmov.u16 r2, q3[0]
2394; CHECK-NEXT:    vmov.u8 r5, q7[9]
2395; CHECK-NEXT:    vmov.32 q0[0], r2
2396; CHECK-NEXT:    vmov.u16 r2, q3[1]
2397; CHECK-NEXT:    vmov.32 q0[1], r2
2398; CHECK-NEXT:    vmov.u16 r2, q3[2]
2399; CHECK-NEXT:    vmov.32 q0[2], r2
2400; CHECK-NEXT:    vmov.u16 r2, q3[3]
2401; CHECK-NEXT:    vmov.32 q0[3], r2
2402; CHECK-NEXT:    vmov.32 q1[2], r5
2403; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2404; CHECK-NEXT:    vmrs r2, p0
2405; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
2406; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
2407; CHECK-NEXT:    vand q1, q1, q5
2408; CHECK-NEXT:    vmov r5, s4
2409; CHECK-NEXT:    and r6, r2, #1
2410; CHECK-NEXT:    rsbs r6, r6, #0
2411; CHECK-NEXT:    vmov.32 q4[0], r6
2412; CHECK-NEXT:    vmov.32 q4[1], r6
2413; CHECK-NEXT:    ubfx r6, r2, #4, #1
2414; CHECK-NEXT:    rsbs r6, r6, #0
2415; CHECK-NEXT:    vmov.32 q4[2], r6
2416; CHECK-NEXT:    vmov.32 q4[3], r6
2417; CHECK-NEXT:    vmov.u8 r6, q6[8]
2418; CHECK-NEXT:    vmov.32 q0[0], r6
2419; CHECK-NEXT:    vmov.u8 r6, q6[9]
2420; CHECK-NEXT:    vmov.32 q0[2], r6
2421; CHECK-NEXT:    vand q0, q0, q5
2422; CHECK-NEXT:    vmov r6, s0
2423; CHECK-NEXT:    umull r6, r5, r5, r6
2424; CHECK-NEXT:    vmov.32 q2[0], r6
2425; CHECK-NEXT:    vmov r6, s2
2426; CHECK-NEXT:    vmov.32 q2[1], r5
2427; CHECK-NEXT:    vmov r5, s6
2428; CHECK-NEXT:    umull r6, r5, r5, r6
2429; CHECK-NEXT:    vmov.32 q2[2], r6
2430; CHECK-NEXT:    vmov.32 q2[3], r5
2431; CHECK-NEXT:    vand q0, q2, q4
2432; CHECK-NEXT:    vmov r5, s0
2433; CHECK-NEXT:    vmov r6, s1
2434; CHECK-NEXT:    vmov r4, s3
2435; CHECK-NEXT:    adds.w r5, r5, r12
2436; CHECK-NEXT:    adcs r6, r3
2437; CHECK-NEXT:    vmov r3, s2
2438; CHECK-NEXT:    adds r3, r3, r5
2439; CHECK-NEXT:    adc.w r12, r6, r4
2440; CHECK-NEXT:    ubfx r6, r2, #8, #1
2441; CHECK-NEXT:    rsbs r6, r6, #0
2442; CHECK-NEXT:    ubfx r2, r2, #12, #1
2443; CHECK-NEXT:    vmov.32 q4[0], r6
2444; CHECK-NEXT:    rsbs r2, r2, #0
2445; CHECK-NEXT:    vmov.32 q4[1], r6
2446; CHECK-NEXT:    vmov.u8 r6, q7[10]
2447; CHECK-NEXT:    vmov.32 q4[2], r2
2448; CHECK-NEXT:    vmov.32 q1[0], r6
2449; CHECK-NEXT:    vmov.32 q4[3], r2
2450; CHECK-NEXT:    vmov.u8 r2, q6[10]
2451; CHECK-NEXT:    vmov.32 q0[0], r2
2452; CHECK-NEXT:    vmov.u8 r2, q6[11]
2453; CHECK-NEXT:    vmov.u8 r6, q7[11]
2454; CHECK-NEXT:    vmov.32 q0[2], r2
2455; CHECK-NEXT:    vmov.32 q1[2], r6
2456; CHECK-NEXT:    vand q0, q0, q5
2457; CHECK-NEXT:    vand q1, q1, q5
2458; CHECK-NEXT:    vmov r2, s0
2459; CHECK-NEXT:    vmov r6, s4
2460; CHECK-NEXT:    umull r2, r6, r6, r2
2461; CHECK-NEXT:    vmov.32 q2[0], r2
2462; CHECK-NEXT:    vmov r2, s2
2463; CHECK-NEXT:    vmov.32 q2[1], r6
2464; CHECK-NEXT:    vmov r6, s6
2465; CHECK-NEXT:    umull r2, r6, r6, r2
2466; CHECK-NEXT:    vmov.32 q2[2], r2
2467; CHECK-NEXT:    vmov.32 q2[3], r6
2468; CHECK-NEXT:    vand q0, q2, q4
2469; CHECK-NEXT:    vmov r6, s0
2470; CHECK-NEXT:    vmov r2, s1
2471; CHECK-NEXT:    vmov r5, s2
2472; CHECK-NEXT:    adds r3, r3, r6
2473; CHECK-NEXT:    vmov r6, s3
2474; CHECK-NEXT:    adc.w r2, r2, r12
2475; CHECK-NEXT:    adds.w r12, r3, r5
2476; CHECK-NEXT:    vmov.u8 r5, q7[12]
2477; CHECK-NEXT:    vmov.32 q1[0], r5
2478; CHECK-NEXT:    vmov.u8 r5, q7[13]
2479; CHECK-NEXT:    vmov.32 q1[2], r5
2480; CHECK-NEXT:    vand q1, q1, q5
2481; CHECK-NEXT:    vmov r5, s4
2482; CHECK-NEXT:    adc.w r3, r2, r6
2483; CHECK-NEXT:    vmov.u16 r2, q3[4]
2484; CHECK-NEXT:    vmov.32 q0[0], r2
2485; CHECK-NEXT:    vmov.u16 r2, q3[5]
2486; CHECK-NEXT:    vmov.32 q0[1], r2
2487; CHECK-NEXT:    vmov.u16 r2, q3[6]
2488; CHECK-NEXT:    vmov.32 q0[2], r2
2489; CHECK-NEXT:    vmov.u16 r2, q3[7]
2490; CHECK-NEXT:    vmov.32 q0[3], r2
2491; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2492; CHECK-NEXT:    vmrs r2, p0
2493; CHECK-NEXT:    and r6, r2, #1
2494; CHECK-NEXT:    rsbs r6, r6, #0
2495; CHECK-NEXT:    vmov.32 q3[0], r6
2496; CHECK-NEXT:    vmov.32 q3[1], r6
2497; CHECK-NEXT:    ubfx r6, r2, #4, #1
2498; CHECK-NEXT:    rsbs r6, r6, #0
2499; CHECK-NEXT:    vmov.32 q3[2], r6
2500; CHECK-NEXT:    vmov.32 q3[3], r6
2501; CHECK-NEXT:    vmov.u8 r6, q6[12]
2502; CHECK-NEXT:    vmov.32 q0[0], r6
2503; CHECK-NEXT:    vmov.u8 r6, q6[13]
2504; CHECK-NEXT:    vmov.32 q0[2], r6
2505; CHECK-NEXT:    vand q0, q0, q5
2506; CHECK-NEXT:    vmov r6, s0
2507; CHECK-NEXT:    umull r6, r5, r5, r6
2508; CHECK-NEXT:    vmov.32 q2[0], r6
2509; CHECK-NEXT:    vmov r6, s2
2510; CHECK-NEXT:    vmov.32 q2[1], r5
2511; CHECK-NEXT:    vmov r5, s6
2512; CHECK-NEXT:    umull r6, r5, r5, r6
2513; CHECK-NEXT:    vmov.32 q2[2], r6
2514; CHECK-NEXT:    vmov.32 q2[3], r5
2515; CHECK-NEXT:    vand q0, q2, q3
2516; CHECK-NEXT:    vmov r5, s0
2517; CHECK-NEXT:    vmov r6, s1
2518; CHECK-NEXT:    vmov r4, s3
2519; CHECK-NEXT:    adds.w r5, r5, r12
2520; CHECK-NEXT:    adcs r6, r3
2521; CHECK-NEXT:    vmov r3, s2
2522; CHECK-NEXT:    adds r3, r3, r5
2523; CHECK-NEXT:    adc.w r12, r6, r4
2524; CHECK-NEXT:    ubfx r6, r2, #8, #1
2525; CHECK-NEXT:    rsbs r6, r6, #0
2526; CHECK-NEXT:    ubfx r2, r2, #12, #1
2527; CHECK-NEXT:    vmov.32 q3[0], r6
2528; CHECK-NEXT:    rsbs r2, r2, #0
2529; CHECK-NEXT:    vmov.32 q3[1], r6
2530; CHECK-NEXT:    vmov.u8 r6, q7[14]
2531; CHECK-NEXT:    vmov.32 q3[2], r2
2532; CHECK-NEXT:    vmov.32 q1[0], r6
2533; CHECK-NEXT:    vmov.32 q3[3], r2
2534; CHECK-NEXT:    vmov.u8 r2, q6[14]
2535; CHECK-NEXT:    vmov.32 q0[0], r2
2536; CHECK-NEXT:    vmov.u8 r2, q6[15]
2537; CHECK-NEXT:    vmov.u8 r6, q7[15]
2538; CHECK-NEXT:    vmov.32 q0[2], r2
2539; CHECK-NEXT:    vmov.32 q1[2], r6
2540; CHECK-NEXT:    vand q0, q0, q5
2541; CHECK-NEXT:    vand q1, q1, q5
2542; CHECK-NEXT:    vmov r2, s0
2543; CHECK-NEXT:    vmov r6, s4
2544; CHECK-NEXT:    umull r2, r6, r6, r2
2545; CHECK-NEXT:    vmov.32 q2[0], r2
2546; CHECK-NEXT:    vmov r2, s2
2547; CHECK-NEXT:    vmov.32 q2[1], r6
2548; CHECK-NEXT:    vmov r6, s6
2549; CHECK-NEXT:    umull r2, r6, r6, r2
2550; CHECK-NEXT:    vmov.32 q2[2], r2
2551; CHECK-NEXT:    vmov.32 q2[3], r6
2552; CHECK-NEXT:    vand q0, q2, q3
2553; CHECK-NEXT:    vmov r6, s0
2554; CHECK-NEXT:    vmov r2, s1
2555; CHECK-NEXT:    vmov r5, s2
2556; CHECK-NEXT:    adds r3, r3, r6
2557; CHECK-NEXT:    vmov r6, s3
2558; CHECK-NEXT:    adc.w r2, r2, r12
2559; CHECK-NEXT:    adds r3, r3, r5
2560; CHECK-NEXT:    adcs r2, r6
2561; CHECK-NEXT:    adds r0, r0, r3
2562; CHECK-NEXT:    adcs r1, r2
2563; CHECK-NEXT:    add sp, #80
2564; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2565; CHECK-NEXT:    pop {r4, r5, r6, pc}
2566entry:
2567  %c = icmp eq <16 x i8> %b, zeroinitializer
2568  %xx = zext <16 x i8> %x to <16 x i64>
2569  %yy = zext <16 x i8> %y to <16 x i64>
2570  %m = mul <16 x i64> %xx, %yy
2571  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
2572  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
2573  %r = add i64 %z, %a
2574  ret i64 %r
2575}
2576
; Predicated multiply-accumulate reduction: <16 x i8> inputs sign-extended to
; <16 x i64>, multiplied lane-wise, summed, and added to the scalar %a.
; Only lanes where %b == 0 contribute; the select replaces the others with 0.
;
; The autogenerated assertions below expect fully scalarized code:
;   * the i8 predicate is widened in steps (vpsel to 0x00/0xff bytes, then
;     vcmp.i16 / vcmp.i32 on lane-by-lane copies) and read out with vmrs;
;   * per-lane masks are rebuilt from the p0 bits with and/ubfx + rsbs;
;   * each lane is extracted with vmov.u8, sign-extended with sxtb and
;     multiplied with smull, then masked with vand;
;   * the 64-bit partial sums are chained through adds/adc pairs, ending with
;     adds/adcs into r0, r1.
; d8-d15 and r4, r5, r7, lr are saved and restored around the body.
2577define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
2578; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
2579; CHECK:       @ %bb.0: @ %entry
2580; CHECK-NEXT:    .save {r4, r5, r7, lr}
2581; CHECK-NEXT:    push {r4, r5, r7, lr}
2582; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2583; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2584; CHECK-NEXT:    vcmp.i8 eq, q2, zr
2585; CHECK-NEXT:    vmov.i8 q2, #0x0
2586; CHECK-NEXT:    vmov.i8 q3, #0xff
2587; CHECK-NEXT:    vpsel q4, q3, q2
2588; CHECK-NEXT:    vmov.u8 r2, q4[0]
2589; CHECK-NEXT:    vmov.16 q5[0], r2
2590; CHECK-NEXT:    vmov.u8 r2, q4[1]
2591; CHECK-NEXT:    vmov.16 q5[1], r2
2592; CHECK-NEXT:    vmov.u8 r2, q4[2]
2593; CHECK-NEXT:    vmov.16 q5[2], r2
2594; CHECK-NEXT:    vmov.u8 r2, q4[3]
2595; CHECK-NEXT:    vmov.16 q5[3], r2
2596; CHECK-NEXT:    vmov.u8 r2, q4[4]
2597; CHECK-NEXT:    vmov.16 q5[4], r2
2598; CHECK-NEXT:    vmov.u8 r2, q4[5]
2599; CHECK-NEXT:    vmov.16 q5[5], r2
2600; CHECK-NEXT:    vmov.u8 r2, q4[6]
2601; CHECK-NEXT:    vmov.16 q5[6], r2
2602; CHECK-NEXT:    vmov.u8 r2, q4[7]
2603; CHECK-NEXT:    vmov.16 q5[7], r2
2604; CHECK-NEXT:    vcmp.i16 ne, q5, zr
2605; CHECK-NEXT:    vpsel q5, q3, q2
2606; CHECK-NEXT:    vmov.u16 r2, q5[0]
2607; CHECK-NEXT:    vmov.32 q6[0], r2
2608; CHECK-NEXT:    vmov.u16 r2, q5[1]
2609; CHECK-NEXT:    vmov.32 q6[1], r2
2610; CHECK-NEXT:    vmov.u16 r2, q5[2]
2611; CHECK-NEXT:    vmov.32 q6[2], r2
2612; CHECK-NEXT:    vmov.u16 r2, q5[3]
2613; CHECK-NEXT:    vmov.32 q6[3], r2
2614; CHECK-NEXT:    vcmp.i32 ne, q6, zr
2615; CHECK-NEXT:    vmrs r2, p0
2616; CHECK-NEXT:    and r3, r2, #1
2617; CHECK-NEXT:    rsbs r3, r3, #0
2618; CHECK-NEXT:    vmov.32 q6[0], r3
2619; CHECK-NEXT:    vmov.32 q6[1], r3
2620; CHECK-NEXT:    ubfx r3, r2, #4, #1
2621; CHECK-NEXT:    rsbs r3, r3, #0
2622; CHECK-NEXT:    vmov.32 q6[2], r3
2623; CHECK-NEXT:    vmov.32 q6[3], r3
2624; CHECK-NEXT:    vmov.u8 r3, q1[0]
2625; CHECK-NEXT:    sxtb.w r12, r3
2626; CHECK-NEXT:    vmov.u8 r3, q0[0]
2627; CHECK-NEXT:    sxtb r3, r3
2628; CHECK-NEXT:    smull r3, r12, r3, r12
2629; CHECK-NEXT:    vmov.32 q7[0], r3
2630; CHECK-NEXT:    vmov.u8 r3, q1[1]
2631; CHECK-NEXT:    vmov.32 q7[1], r12
2632; CHECK-NEXT:    sxtb.w r12, r3
2633; CHECK-NEXT:    vmov.u8 r3, q0[1]
2634; CHECK-NEXT:    sxtb r3, r3
2635; CHECK-NEXT:    smull r3, r12, r3, r12
2636; CHECK-NEXT:    vmov.32 q7[2], r3
2637; CHECK-NEXT:    vmov.32 q7[3], r12
2638; CHECK-NEXT:    vand q6, q7, q6
2639; CHECK-NEXT:    vmov r3, s26
2640; CHECK-NEXT:    vmov r4, s24
2641; CHECK-NEXT:    vmov r12, s27
2642; CHECK-NEXT:    vmov r5, s25
2643; CHECK-NEXT:    adds.w lr, r4, r3
2644; CHECK-NEXT:    ubfx r3, r2, #8, #1
2645; CHECK-NEXT:    rsb.w r3, r3, #0
2646; CHECK-NEXT:    ubfx r2, r2, #12, #1
2647; CHECK-NEXT:    vmov.32 q6[0], r3
2648; CHECK-NEXT:    rsb.w r2, r2, #0
2649; CHECK-NEXT:    vmov.32 q6[1], r3
2650; CHECK-NEXT:    vmov.u8 r3, q0[2]
2651; CHECK-NEXT:    vmov.32 q6[2], r2
2652; CHECK-NEXT:    sxtb r3, r3
2653; CHECK-NEXT:    vmov.32 q6[3], r2
2654; CHECK-NEXT:    vmov.u8 r2, q1[2]
2655; CHECK-NEXT:    sxtb r2, r2
2656; CHECK-NEXT:    adc.w r12, r12, r5
2657; CHECK-NEXT:    smull r2, r3, r3, r2
2658; CHECK-NEXT:    vmov.32 q7[0], r2
2659; CHECK-NEXT:    vmov.u8 r2, q1[3]
2660; CHECK-NEXT:    vmov.32 q7[1], r3
2661; CHECK-NEXT:    vmov.u8 r3, q0[3]
2662; CHECK-NEXT:    sxtb r2, r2
2663; CHECK-NEXT:    sxtb r3, r3
2664; CHECK-NEXT:    smull r2, r3, r3, r2
2665; CHECK-NEXT:    vmov.32 q7[2], r2
2666; CHECK-NEXT:    vmov.32 q7[3], r3
2667; CHECK-NEXT:    vand q6, q7, q6
2668; CHECK-NEXT:    vmov r3, s24
2669; CHECK-NEXT:    vmov r2, s25
2670; CHECK-NEXT:    vmov r4, s26
2671; CHECK-NEXT:    vmov r5, s27
2672; CHECK-NEXT:    adds.w r3, r3, lr
2673; CHECK-NEXT:    adc.w r2, r2, r12
2674; CHECK-NEXT:    adds.w r12, r3, r4
2675; CHECK-NEXT:    vmov.u8 r4, q0[4]
2676; CHECK-NEXT:    adc.w r3, r2, r5
2677; CHECK-NEXT:    vmov.u16 r2, q5[4]
2678; CHECK-NEXT:    vmov.32 q6[0], r2
2679; CHECK-NEXT:    vmov.u16 r2, q5[5]
2680; CHECK-NEXT:    vmov.32 q6[1], r2
2681; CHECK-NEXT:    vmov.u16 r2, q5[6]
2682; CHECK-NEXT:    vmov.32 q6[2], r2
2683; CHECK-NEXT:    vmov.u16 r2, q5[7]
2684; CHECK-NEXT:    vmov.32 q6[3], r2
2685; CHECK-NEXT:    sxtb r4, r4
2686; CHECK-NEXT:    vcmp.i32 ne, q6, zr
2687; CHECK-NEXT:    vmrs r2, p0
2688; CHECK-NEXT:    and r5, r2, #1
2689; CHECK-NEXT:    rsbs r5, r5, #0
2690; CHECK-NEXT:    vmov.32 q5[0], r5
2691; CHECK-NEXT:    vmov.32 q5[1], r5
2692; CHECK-NEXT:    ubfx r5, r2, #4, #1
2693; CHECK-NEXT:    rsbs r5, r5, #0
2694; CHECK-NEXT:    vmov.32 q5[2], r5
2695; CHECK-NEXT:    vmov.32 q5[3], r5
2696; CHECK-NEXT:    vmov.u8 r5, q1[4]
2697; CHECK-NEXT:    sxtb r5, r5
2698; CHECK-NEXT:    smull r5, r4, r4, r5
2699; CHECK-NEXT:    vmov.32 q6[0], r5
2700; CHECK-NEXT:    vmov.u8 r5, q1[5]
2701; CHECK-NEXT:    vmov.32 q6[1], r4
2702; CHECK-NEXT:    vmov.u8 r4, q0[5]
2703; CHECK-NEXT:    sxtb r5, r5
2704; CHECK-NEXT:    sxtb r4, r4
2705; CHECK-NEXT:    smull r5, r4, r4, r5
2706; CHECK-NEXT:    vmov.32 q6[2], r5
2707; CHECK-NEXT:    vmov.32 q6[3], r4
2708; CHECK-NEXT:    vand q5, q6, q5
2709; CHECK-NEXT:    vmov r4, s20
2710; CHECK-NEXT:    vmov r5, s21
2711; CHECK-NEXT:    adds.w r12, r12, r4
2712; CHECK-NEXT:    vmov r4, s22
2713; CHECK-NEXT:    adcs r5, r3
2714; CHECK-NEXT:    vmov r3, s23
2715; CHECK-NEXT:    adds.w r4, r4, r12
2716; CHECK-NEXT:    adc.w r12, r5, r3
2717; CHECK-NEXT:    ubfx r3, r2, #8, #1
2718; CHECK-NEXT:    rsbs r3, r3, #0
2719; CHECK-NEXT:    ubfx r2, r2, #12, #1
2720; CHECK-NEXT:    vmov.32 q5[0], r3
2721; CHECK-NEXT:    rsbs r2, r2, #0
2722; CHECK-NEXT:    vmov.32 q5[1], r3
2723; CHECK-NEXT:    vmov.u8 r3, q0[6]
2724; CHECK-NEXT:    vmov.32 q5[2], r2
2725; CHECK-NEXT:    sxtb r3, r3
2726; CHECK-NEXT:    vmov.32 q5[3], r2
2727; CHECK-NEXT:    vmov.u8 r2, q1[6]
2728; CHECK-NEXT:    sxtb r2, r2
2729; CHECK-NEXT:    smull r2, r3, r3, r2
2730; CHECK-NEXT:    vmov.32 q6[0], r2
2731; CHECK-NEXT:    vmov.u8 r2, q1[7]
2732; CHECK-NEXT:    vmov.32 q6[1], r3
2733; CHECK-NEXT:    vmov.u8 r3, q0[7]
2734; CHECK-NEXT:    sxtb r2, r2
2735; CHECK-NEXT:    sxtb r3, r3
2736; CHECK-NEXT:    smull r2, r3, r3, r2
2737; CHECK-NEXT:    vmov.32 q6[2], r2
2738; CHECK-NEXT:    vmov.32 q6[3], r3
2739; CHECK-NEXT:    vand q5, q6, q5
2740; CHECK-NEXT:    vmov r3, s20
2741; CHECK-NEXT:    vmov r2, s21
2742; CHECK-NEXT:    vmov r5, s23
2743; CHECK-NEXT:    adds r3, r3, r4
2744; CHECK-NEXT:    vmov r4, s22
2745; CHECK-NEXT:    adc.w r2, r2, r12
2746; CHECK-NEXT:    adds.w r12, r3, r4
2747; CHECK-NEXT:    vmov.u8 r4, q0[8]
2748; CHECK-NEXT:    adc.w r3, r2, r5
2749; CHECK-NEXT:    vmov.u8 r2, q4[8]
2750; CHECK-NEXT:    vmov.16 q5[0], r2
2751; CHECK-NEXT:    vmov.u8 r2, q4[9]
2752; CHECK-NEXT:    vmov.16 q5[1], r2
2753; CHECK-NEXT:    vmov.u8 r2, q4[10]
2754; CHECK-NEXT:    vmov.16 q5[2], r2
2755; CHECK-NEXT:    vmov.u8 r2, q4[11]
2756; CHECK-NEXT:    vmov.16 q5[3], r2
2757; CHECK-NEXT:    vmov.u8 r2, q4[12]
2758; CHECK-NEXT:    vmov.16 q5[4], r2
2759; CHECK-NEXT:    vmov.u8 r2, q4[13]
2760; CHECK-NEXT:    vmov.16 q5[5], r2
2761; CHECK-NEXT:    vmov.u8 r2, q4[14]
2762; CHECK-NEXT:    vmov.16 q5[6], r2
2763; CHECK-NEXT:    vmov.u8 r2, q4[15]
2764; CHECK-NEXT:    vmov.16 q5[7], r2
2765; CHECK-NEXT:    sxtb r4, r4
2766; CHECK-NEXT:    vcmp.i16 ne, q5, zr
2767; CHECK-NEXT:    vpsel q2, q3, q2
2768; CHECK-NEXT:    vmov.u16 r2, q2[0]
2769; CHECK-NEXT:    vmov.32 q3[0], r2
2770; CHECK-NEXT:    vmov.u16 r2, q2[1]
2771; CHECK-NEXT:    vmov.32 q3[1], r2
2772; CHECK-NEXT:    vmov.u16 r2, q2[2]
2773; CHECK-NEXT:    vmov.32 q3[2], r2
2774; CHECK-NEXT:    vmov.u16 r2, q2[3]
2775; CHECK-NEXT:    vmov.32 q3[3], r2
2776; CHECK-NEXT:    vcmp.i32 ne, q3, zr
2777; CHECK-NEXT:    vmrs r2, p0
2778; CHECK-NEXT:    and r5, r2, #1
2779; CHECK-NEXT:    rsbs r5, r5, #0
2780; CHECK-NEXT:    vmov.32 q3[0], r5
2781; CHECK-NEXT:    vmov.32 q3[1], r5
2782; CHECK-NEXT:    ubfx r5, r2, #4, #1
2783; CHECK-NEXT:    rsbs r5, r5, #0
2784; CHECK-NEXT:    vmov.32 q3[2], r5
2785; CHECK-NEXT:    vmov.32 q3[3], r5
2786; CHECK-NEXT:    vmov.u8 r5, q1[8]
2787; CHECK-NEXT:    sxtb r5, r5
2788; CHECK-NEXT:    smull r5, r4, r4, r5
2789; CHECK-NEXT:    vmov.32 q4[0], r5
2790; CHECK-NEXT:    vmov.u8 r5, q1[9]
2791; CHECK-NEXT:    vmov.32 q4[1], r4
2792; CHECK-NEXT:    vmov.u8 r4, q0[9]
2793; CHECK-NEXT:    sxtb r5, r5
2794; CHECK-NEXT:    sxtb r4, r4
2795; CHECK-NEXT:    smull r5, r4, r4, r5
2796; CHECK-NEXT:    vmov.32 q4[2], r5
2797; CHECK-NEXT:    vmov.32 q4[3], r4
2798; CHECK-NEXT:    vand q3, q4, q3
2799; CHECK-NEXT:    vmov r4, s12
2800; CHECK-NEXT:    vmov r5, s13
2801; CHECK-NEXT:    adds.w r12, r12, r4
2802; CHECK-NEXT:    vmov r4, s14
2803; CHECK-NEXT:    adcs r5, r3
2804; CHECK-NEXT:    vmov r3, s15
2805; CHECK-NEXT:    adds.w r4, r4, r12
2806; CHECK-NEXT:    adc.w r12, r5, r3
2807; CHECK-NEXT:    ubfx r3, r2, #8, #1
2808; CHECK-NEXT:    rsbs r3, r3, #0
2809; CHECK-NEXT:    ubfx r2, r2, #12, #1
2810; CHECK-NEXT:    vmov.32 q3[0], r3
2811; CHECK-NEXT:    rsbs r2, r2, #0
2812; CHECK-NEXT:    vmov.32 q3[1], r3
2813; CHECK-NEXT:    vmov.u8 r3, q0[10]
2814; CHECK-NEXT:    vmov.32 q3[2], r2
2815; CHECK-NEXT:    sxtb r3, r3
2816; CHECK-NEXT:    vmov.32 q3[3], r2
2817; CHECK-NEXT:    vmov.u8 r2, q1[10]
2818; CHECK-NEXT:    sxtb r2, r2
2819; CHECK-NEXT:    smull r2, r3, r3, r2
2820; CHECK-NEXT:    vmov.32 q4[0], r2
2821; CHECK-NEXT:    vmov.u8 r2, q1[11]
2822; CHECK-NEXT:    vmov.32 q4[1], r3
2823; CHECK-NEXT:    vmov.u8 r3, q0[11]
2824; CHECK-NEXT:    sxtb r2, r2
2825; CHECK-NEXT:    sxtb r3, r3
2826; CHECK-NEXT:    smull r2, r3, r3, r2
2827; CHECK-NEXT:    vmov.32 q4[2], r2
2828; CHECK-NEXT:    vmov.32 q4[3], r3
2829; CHECK-NEXT:    vand q3, q4, q3
2830; CHECK-NEXT:    vmov r3, s12
2831; CHECK-NEXT:    vmov r2, s13
2832; CHECK-NEXT:    vmov r5, s15
2833; CHECK-NEXT:    adds r3, r3, r4
2834; CHECK-NEXT:    vmov r4, s14
2835; CHECK-NEXT:    adc.w r2, r2, r12
2836; CHECK-NEXT:    adds.w r12, r3, r4
2837; CHECK-NEXT:    vmov.u8 r4, q0[12]
2838; CHECK-NEXT:    adc.w r3, r2, r5
2839; CHECK-NEXT:    vmov.u16 r2, q2[4]
2840; CHECK-NEXT:    vmov.32 q3[0], r2
2841; CHECK-NEXT:    vmov.u16 r2, q2[5]
2842; CHECK-NEXT:    vmov.32 q3[1], r2
2843; CHECK-NEXT:    vmov.u16 r2, q2[6]
2844; CHECK-NEXT:    vmov.32 q3[2], r2
2845; CHECK-NEXT:    vmov.u16 r2, q2[7]
2846; CHECK-NEXT:    vmov.32 q3[3], r2
2847; CHECK-NEXT:    sxtb r4, r4
2848; CHECK-NEXT:    vcmp.i32 ne, q3, zr
2849; CHECK-NEXT:    vmrs r2, p0
2850; CHECK-NEXT:    and r5, r2, #1
2851; CHECK-NEXT:    rsbs r5, r5, #0
2852; CHECK-NEXT:    vmov.32 q2[0], r5
2853; CHECK-NEXT:    vmov.32 q2[1], r5
2854; CHECK-NEXT:    ubfx r5, r2, #4, #1
2855; CHECK-NEXT:    rsbs r5, r5, #0
2856; CHECK-NEXT:    vmov.32 q2[2], r5
2857; CHECK-NEXT:    vmov.32 q2[3], r5
2858; CHECK-NEXT:    vmov.u8 r5, q1[12]
2859; CHECK-NEXT:    sxtb r5, r5
2860; CHECK-NEXT:    smull r5, r4, r4, r5
2861; CHECK-NEXT:    vmov.32 q3[0], r5
2862; CHECK-NEXT:    vmov.u8 r5, q1[13]
2863; CHECK-NEXT:    vmov.32 q3[1], r4
2864; CHECK-NEXT:    vmov.u8 r4, q0[13]
2865; CHECK-NEXT:    sxtb r5, r5
2866; CHECK-NEXT:    sxtb r4, r4
2867; CHECK-NEXT:    smull r5, r4, r4, r5
2868; CHECK-NEXT:    vmov.32 q3[2], r5
2869; CHECK-NEXT:    vmov.32 q3[3], r4
2870; CHECK-NEXT:    vand q2, q3, q2
2871; CHECK-NEXT:    vmov r4, s8
2872; CHECK-NEXT:    vmov r5, s9
2873; CHECK-NEXT:    adds.w r12, r12, r4
2874; CHECK-NEXT:    vmov r4, s10
2875; CHECK-NEXT:    adcs r5, r3
2876; CHECK-NEXT:    vmov r3, s11
2877; CHECK-NEXT:    adds.w r4, r4, r12
2878; CHECK-NEXT:    adc.w r12, r5, r3
2879; CHECK-NEXT:    ubfx r3, r2, #8, #1
2880; CHECK-NEXT:    rsbs r3, r3, #0
2881; CHECK-NEXT:    ubfx r2, r2, #12, #1
2882; CHECK-NEXT:    vmov.32 q2[0], r3
2883; CHECK-NEXT:    rsbs r2, r2, #0
2884; CHECK-NEXT:    vmov.32 q2[1], r3
2885; CHECK-NEXT:    vmov.u8 r3, q0[14]
2886; CHECK-NEXT:    vmov.32 q2[2], r2
2887; CHECK-NEXT:    sxtb r3, r3
2888; CHECK-NEXT:    vmov.32 q2[3], r2
2889; CHECK-NEXT:    vmov.u8 r2, q1[14]
2890; CHECK-NEXT:    sxtb r2, r2
2891; CHECK-NEXT:    smull r2, r3, r3, r2
2892; CHECK-NEXT:    vmov.32 q3[0], r2
2893; CHECK-NEXT:    vmov.u8 r2, q1[15]
2894; CHECK-NEXT:    vmov.32 q3[1], r3
2895; CHECK-NEXT:    vmov.u8 r3, q0[15]
2896; CHECK-NEXT:    sxtb r2, r2
2897; CHECK-NEXT:    sxtb r3, r3
2898; CHECK-NEXT:    smull r2, r3, r3, r2
2899; CHECK-NEXT:    vmov.32 q3[2], r2
2900; CHECK-NEXT:    vmov.32 q3[3], r3
2901; CHECK-NEXT:    vand q0, q3, q2
2902; CHECK-NEXT:    vmov r3, s0
2903; CHECK-NEXT:    vmov r2, s1
2904; CHECK-NEXT:    vmov r5, s3
2905; CHECK-NEXT:    adds r3, r3, r4
2906; CHECK-NEXT:    vmov r4, s2
2907; CHECK-NEXT:    adc.w r2, r2, r12
2908; CHECK-NEXT:    adds r3, r3, r4
2909; CHECK-NEXT:    adcs r2, r5
2910; CHECK-NEXT:    adds r0, r0, r3
2911; CHECK-NEXT:    adcs r1, r2
2912; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2913; CHECK-NEXT:    pop {r4, r5, r7, pc}
2914entry:
  ; Lane predicate: true where the corresponding element of %b is zero.
2915  %c = icmp eq <16 x i8> %b, zeroinitializer
  ; Widen both operands with sign extension before the 64-bit multiply.
2916  %xx = sext <16 x i8> %x to <16 x i64>
2917  %yy = sext <16 x i8> %y to <16 x i64>
2918  %m = mul <16 x i64> %xx, %yy
  ; Inactive lanes are replaced by 0 so they do not affect the sum.
2919  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
2920  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
  ; Fold the scalar accumulator into the reduced value.
2921  %r = add i64 %z, %a
2922  ret i64 %r
2923}
2924
; Predicated multiply-accumulate reduction: <2 x i8> inputs zero-extended to
; <2 x i64>, multiplied lane-wise, summed, and added to the scalar %a.
; Only lanes where %b == 0 contribute; the select replaces the others with 0.
;
; The autogenerated assertions below expect:
;   * %x, %y and %b masked to their low byte per 64-bit lane (vmov.i64 #0xff
;     followed by vand);
;   * each lane product formed with umull on values moved to core registers;
;   * the per-lane mask computed in scalar code (cmp / cset / tst / csetm) and
;     applied with vand;
;   * the two 64-bit lanes summed with adds/adc, then adds/adcs into r0, r1.
2925define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
2926; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
2927; CHECK:       @ %bb.0: @ %entry
2928; CHECK-NEXT:    .save {r7, lr}
2929; CHECK-NEXT:    push {r7, lr}
2930; CHECK-NEXT:    .vsave {d8, d9}
2931; CHECK-NEXT:    vpush {d8, d9}
2932; CHECK-NEXT:    vmov.i64 q3, #0xff
2933; CHECK-NEXT:    vand q1, q1, q3
2934; CHECK-NEXT:    vand q4, q0, q3
2935; CHECK-NEXT:    vmov r2, s4
2936; CHECK-NEXT:    vmov r3, s16
2937; CHECK-NEXT:    umull r2, r3, r3, r2
2938; CHECK-NEXT:    vmov.32 q0[0], r2
2939; CHECK-NEXT:    vmov r2, s6
2940; CHECK-NEXT:    vmov.32 q0[1], r3
2941; CHECK-NEXT:    vmov r3, s18
2942; CHECK-NEXT:    vand q1, q2, q3
2943; CHECK-NEXT:    umull r2, r3, r3, r2
2944; CHECK-NEXT:    vmov.32 q0[2], r2
2945; CHECK-NEXT:    vmov r2, s4
2946; CHECK-NEXT:    vmov.32 q0[3], r3
2947; CHECK-NEXT:    cmp r2, #0
2948; CHECK-NEXT:    cset r2, eq
2949; CHECK-NEXT:    tst.w r2, #1
2950; CHECK-NEXT:    csetm r2, ne
2951; CHECK-NEXT:    vmov.32 q2[0], r2
2952; CHECK-NEXT:    vmov.32 q2[1], r2
2953; CHECK-NEXT:    vmov r2, s6
2954; CHECK-NEXT:    cmp r2, #0
2955; CHECK-NEXT:    cset r2, eq
2956; CHECK-NEXT:    tst.w r2, #1
2957; CHECK-NEXT:    csetm r2, ne
2958; CHECK-NEXT:    vmov.32 q2[2], r2
2959; CHECK-NEXT:    vmov.32 q2[3], r2
2960; CHECK-NEXT:    vand q0, q0, q2
2961; CHECK-NEXT:    vmov r2, s2
2962; CHECK-NEXT:    vmov r3, s0
2963; CHECK-NEXT:    vmov r12, s3
2964; CHECK-NEXT:    vmov lr, s1
2965; CHECK-NEXT:    adds r2, r2, r3
2966; CHECK-NEXT:    adc.w r3, lr, r12
2967; CHECK-NEXT:    adds r0, r0, r2
2968; CHECK-NEXT:    adcs r1, r3
2969; CHECK-NEXT:    vpop {d8, d9}
2970; CHECK-NEXT:    pop {r7, pc}
2971entry:
  ; Lane predicate: true where the corresponding element of %b is zero.
2972  %c = icmp eq <2 x i8> %b, zeroinitializer
  ; Widen both operands with zero extension before the 64-bit multiply.
2973  %xx = zext <2 x i8> %x to <2 x i64>
2974  %yy = zext <2 x i8> %y to <2 x i64>
2975  %m = mul <2 x i64> %xx, %yy
  ; Inactive lanes are replaced by 0 so they do not affect the sum.
2976  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2977  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ; Fold the scalar accumulator into the reduced value.
2978  %r = add i64 %z, %a
2979  ret i64 %r
2980}
2981
; Predicated multiply-accumulate reduction: <2 x i8> inputs sign-extended to
; <2 x i64>, multiplied lane-wise, summed, and added to the scalar %a.
; Only lanes where %b == 0 contribute; the select replaces the others with 0.
;
; The autogenerated assertions below expect:
;   * %b masked to its low byte (vmov.i32 #0xff followed by vand) before the
;     scalar compare against zero (cmp / cset / tst / csetm);
;   * each lane of %x and %y moved to a core register, sign-extended with
;     sxtb and multiplied with smull;
;   * the products masked with vand, the two 64-bit lanes summed with
;     adds/adc, then adds/adcs into r0, r1.
2982define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
2983; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
2984; CHECK:       @ %bb.0: @ %entry
2985; CHECK-NEXT:    .save {r7, lr}
2986; CHECK-NEXT:    push {r7, lr}
2987; CHECK-NEXT:    vmov.i32 q3, #0xff
2988; CHECK-NEXT:    vmov r3, s0
2989; CHECK-NEXT:    vand q3, q2, q3
2990; CHECK-NEXT:    vmov r2, s12
2991; CHECK-NEXT:    sxtb r3, r3
2992; CHECK-NEXT:    cmp r2, #0
2993; CHECK-NEXT:    cset r2, eq
2994; CHECK-NEXT:    tst.w r2, #1
2995; CHECK-NEXT:    csetm r2, ne
2996; CHECK-NEXT:    vmov.32 q2[0], r2
2997; CHECK-NEXT:    vmov.32 q2[1], r2
2998; CHECK-NEXT:    vmov r2, s14
2999; CHECK-NEXT:    cmp r2, #0
3000; CHECK-NEXT:    cset r2, eq
3001; CHECK-NEXT:    tst.w r2, #1
3002; CHECK-NEXT:    csetm r2, ne
3003; CHECK-NEXT:    vmov.32 q2[2], r2
3004; CHECK-NEXT:    vmov.32 q2[3], r2
3005; CHECK-NEXT:    vmov r2, s4
3006; CHECK-NEXT:    sxtb r2, r2
3007; CHECK-NEXT:    smull r2, r3, r3, r2
3008; CHECK-NEXT:    vmov.32 q3[0], r2
3009; CHECK-NEXT:    vmov r2, s6
3010; CHECK-NEXT:    vmov.32 q3[1], r3
3011; CHECK-NEXT:    vmov r3, s2
3012; CHECK-NEXT:    sxtb r2, r2
3013; CHECK-NEXT:    sxtb r3, r3
3014; CHECK-NEXT:    smull r2, r3, r3, r2
3015; CHECK-NEXT:    vmov.32 q3[2], r2
3016; CHECK-NEXT:    vmov.32 q3[3], r3
3017; CHECK-NEXT:    vand q0, q3, q2
3018; CHECK-NEXT:    vmov r2, s2
3019; CHECK-NEXT:    vmov r3, s0
3020; CHECK-NEXT:    vmov r12, s3
3021; CHECK-NEXT:    vmov lr, s1
3022; CHECK-NEXT:    adds r2, r2, r3
3023; CHECK-NEXT:    adc.w r3, lr, r12
3024; CHECK-NEXT:    adds r0, r0, r2
3025; CHECK-NEXT:    adcs r1, r3
3026; CHECK-NEXT:    pop {r7, pc}
3027entry:
  ; Lane predicate: true where the corresponding element of %b is zero.
3028  %c = icmp eq <2 x i8> %b, zeroinitializer
  ; Widen both operands with sign extension before the 64-bit multiply.
3029  %xx = sext <2 x i8> %x to <2 x i64>
3030  %yy = sext <2 x i8> %y to <2 x i64>
3031  %m = mul <2 x i64> %xx, %yy
  ; Inactive lanes are replaced by 0 so they do not affect the sum.
3032  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
3033  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ; Fold the scalar accumulator into the reduced value.
3034  %r = add i64 %z, %a
3035  ret i64 %r
3036}
3037
; Predicated multiply-accumulate reduction on <2 x i64> operands (no
; extension step): the lane-wise product of %x and %y is summed and added to
; the scalar %a. Only lanes where %b == 0 contribute; the select replaces the
; others with 0.
;
; The autogenerated assertions below expect:
;   * each 64-bit lane product built from one umull plus two mla
;     instructions on the 32-bit halves;
;   * the lane predicate computed by OR-ing the two 32-bit halves of each %b
;     lane (orrs) and turning the eq result into an all-ones/zero mask
;     (cset / tst / csetm);
;   * the products masked with vand, the two lanes summed with adds/adc,
;     then adds/adcs into r0, r1.
3038define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
3039; CHECK-LABEL: add_v2i64_v2i64_acc:
3040; CHECK:       @ %bb.0: @ %entry
3041; CHECK-NEXT:    .save {r4, lr}
3042; CHECK-NEXT:    push {r4, lr}
3043; CHECK-NEXT:    vmov r2, s4
3044; CHECK-NEXT:    vmov r3, s0
3045; CHECK-NEXT:    vmov r4, s5
3046; CHECK-NEXT:    umull r12, lr, r3, r2
3047; CHECK-NEXT:    mla r3, r3, r4, lr
3048; CHECK-NEXT:    vmov r4, s1
3049; CHECK-NEXT:    vmov.32 q3[0], r12
3050; CHECK-NEXT:    mla r2, r4, r2, r3
3051; CHECK-NEXT:    vmov r3, s2
3052; CHECK-NEXT:    vmov.32 q3[1], r2
3053; CHECK-NEXT:    vmov r2, s6
3054; CHECK-NEXT:    umull r4, r12, r3, r2
3055; CHECK-NEXT:    vmov.32 q3[2], r4
3056; CHECK-NEXT:    vmov r4, s7
3057; CHECK-NEXT:    mla r3, r3, r4, r12
3058; CHECK-NEXT:    vmov r4, s3
3059; CHECK-NEXT:    mla r2, r4, r2, r3
3060; CHECK-NEXT:    vmov r3, s8
3061; CHECK-NEXT:    vmov.32 q3[3], r2
3062; CHECK-NEXT:    vmov r2, s9
3063; CHECK-NEXT:    orrs r2, r3
3064; CHECK-NEXT:    vmov r3, s10
3065; CHECK-NEXT:    cset r2, eq
3066; CHECK-NEXT:    tst.w r2, #1
3067; CHECK-NEXT:    csetm r2, ne
3068; CHECK-NEXT:    vmov.32 q0[0], r2
3069; CHECK-NEXT:    vmov.32 q0[1], r2
3070; CHECK-NEXT:    vmov r2, s11
3071; CHECK-NEXT:    orrs r2, r3
3072; CHECK-NEXT:    cset r2, eq
3073; CHECK-NEXT:    tst.w r2, #1
3074; CHECK-NEXT:    csetm r2, ne
3075; CHECK-NEXT:    vmov.32 q0[2], r2
3076; CHECK-NEXT:    vmov.32 q0[3], r2
3077; CHECK-NEXT:    vand q0, q3, q0
3078; CHECK-NEXT:    vmov r4, s2
3079; CHECK-NEXT:    vmov r2, s0
3080; CHECK-NEXT:    vmov r12, s3
3081; CHECK-NEXT:    vmov r3, s1
3082; CHECK-NEXT:    adds r2, r2, r4
3083; CHECK-NEXT:    adc.w r3, r3, r12
3084; CHECK-NEXT:    adds r0, r0, r2
3085; CHECK-NEXT:    adcs r1, r3
3086; CHECK-NEXT:    pop {r4, pc}
3087entry:
  ; Lane predicate: true where the corresponding element of %b is zero.
3088  %c = icmp eq <2 x i64> %b, zeroinitializer
3089  %m = mul <2 x i64> %x, %y
  ; Inactive lanes are replaced by 0 so they do not affect the sum.
3090  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
3091  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ; Fold the scalar accumulator into the reduced value.
3092  %r = add i64 %z, %a
3093  ret i64 %r
3094}
3095
; Declarations of the llvm.vector.reduce.add intrinsic overloads, one per
; vector type. Each takes a single vector operand and returns a scalar of the
; element type.
3096declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
3097declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
3098declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
3099declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
3100declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
3101declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
3102declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
3103declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
3104declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
3105declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
3106