• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s
3
; mul_reduce_add: returns 0 when %N == 0. Otherwise it walks %a and %b four
; i32 lanes per iteration, multiplies the two masked loads lane-wise,
; accumulates the products in %vec.phi and returns the horizontal sum of the
; accumulator. The per-iteration lane mask is produced by
; @llvm.get.active.lane.mask(%index, %N).
; The expected assembly below keeps an explicit vctp.32 / vpstt predicate
; inside a dls/le loop and finishes with vpsel followed by vaddv.u32.
4define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
5; CHECK-LABEL: mul_reduce_add:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    cmp r2, #0
8; CHECK-NEXT:    itt eq
9; CHECK-NEXT:    moveq r0, #0
10; CHECK-NEXT:    bxeq lr
11; CHECK-NEXT:  .LBB0_1: @ %vector.ph
12; CHECK-NEXT:    push {r7, lr}
13; CHECK-NEXT:    adds r3, r2, #3
14; CHECK-NEXT:    vmov.i32 q0, #0x0
15; CHECK-NEXT:    bic r3, r3, #3
16; CHECK-NEXT:    sub.w r12, r3, #4
17; CHECK-NEXT:    movs r3, #1
18; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
19; CHECK-NEXT:    movs r3, #0
20; CHECK-NEXT:    dls lr, lr
21; CHECK-NEXT:  .LBB0_2: @ %vector.body
22; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
23; CHECK-NEXT:    vctp.32 r2
24; CHECK-NEXT:    vmov q1, q0
25; CHECK-NEXT:    vpstt
26; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
27; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
28; CHECK-NEXT:    adds r3, #4
29; CHECK-NEXT:    vmul.i32 q0, q2, q0
30; CHECK-NEXT:    subs r2, #4
31; CHECK-NEXT:    vadd.i32 q0, q0, q1
32; CHECK-NEXT:    le lr, .LBB0_2
33; CHECK-NEXT:  @ %bb.3: @ %middle.block
34; CHECK-NEXT:    vpsel q0, q0, q1
35; CHECK-NEXT:    vaddv.u32 r0, q0
36; CHECK-NEXT:    pop {r7, pc}
37entry:
; Skip the loop entirely for N == 0; for.cond.cleanup then yields 0.
38  %cmp8 = icmp eq i32 %N, 0
39  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
40
41vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4; the loop exits once the index
; reaches it.
42  %n.rnd.up = add i32 %N, 3
43  %n.vec = and i32 %n.rnd.up, -4
; %trip.count.minus.1 and its splat are referenced only by the commented-out
; compare in vector.body.
44  %trip.count.minus.1 = add i32 %N, -1
45  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
46  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
47  br label %vector.body
48
49vector.body:                                      ; preds = %vector.body, %vector.ph
; %index counts processed elements; %vec.phi is the running 4-lane accumulator.
50  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
51  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
; %induction = <index, index+1, index+2, index+3>; it is referenced only by
; the commented-out compare below.
52  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
53  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
54  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
55  %0 = getelementptr inbounds i32, i32* %a, i32 %index
56
; Compare-based form of the mask, left commented out (presumably the form this
; test used before the intrinsic); the call below is what defines %1.
57  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
58  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
59
; Both loads are governed by the same mask %1.
60  %2 = bitcast i32* %0 to <4 x i32>*
61  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
62  %3 = getelementptr inbounds i32, i32* %b, i32 %index
63  %4 = bitcast i32* %3 to <4 x i32>*
64  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
; acc' = b[i..i+3] * a[i..i+3] + acc
65  %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
66  %6 = add nsw <4 x i32> %5, %vec.phi
67  %index.next = add i32 %index, 4
68  %7 = icmp eq i32 %index.next, %n.vec
69  br i1 %7, label %middle.block, label %vector.body
70
71middle.block:                                     ; preds = %vector.body
; Per lane of the last mask %1, take the updated accumulator %6 where the lane
; is active and the previous value %vec.phi otherwise, then sum the four lanes.
72  %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
73  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
74  br label %for.cond.cleanup
75
76for.cond.cleanup:                                 ; preds = %middle.block, %entry
; 0 when arriving from %entry (N == 0), the reduced sum otherwise.
77  %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
78  ret i32 %res.0.lcssa
79}
80
; mul_reduce_add_const: returns 0 when %N == 0. Otherwise it walks %a four
; i32 lanes per iteration under the mask from
; @llvm.get.active.lane.mask(%index, %N), adds each masked load to the
; accumulator %vec.phi and returns the horizontal sum.
; NOTE(review): despite the name, the body contains no multiply and never
; uses %b; its instructions match those of add_reduce_add_const. Confirm
; whether this is intentional before changing it, since the expected assembly
; below would have to be regenerated.
; The expected assembly keeps an explicit vctp.32 / vpst predicate inside a
; dls/le loop and finishes with vpsel followed by vaddv.u32.
81define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
82; CHECK-LABEL: mul_reduce_add_const:
83; CHECK:       @ %bb.0: @ %entry
84; CHECK-NEXT:    cmp r2, #0
85; CHECK-NEXT:    itt eq
86; CHECK-NEXT:    moveq r0, #0
87; CHECK-NEXT:    bxeq lr
88; CHECK-NEXT:  .LBB1_1: @ %vector.ph
89; CHECK-NEXT:    push {r7, lr}
90; CHECK-NEXT:    adds r1, r2, #3
91; CHECK-NEXT:    movs r3, #1
92; CHECK-NEXT:    bic r1, r1, #3
93; CHECK-NEXT:    vmov.i32 q0, #0x0
94; CHECK-NEXT:    subs r1, #4
95; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
96; CHECK-NEXT:    movs r1, #0
97; CHECK-NEXT:    dls lr, lr
98; CHECK-NEXT:  .LBB1_2: @ %vector.body
99; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
100; CHECK-NEXT:    vctp.32 r2
101; CHECK-NEXT:    vmov q1, q0
102; CHECK-NEXT:    vpst
103; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
104; CHECK-NEXT:    adds r1, #4
105; CHECK-NEXT:    subs r2, #4
106; CHECK-NEXT:    vadd.i32 q0, q0, q1
107; CHECK-NEXT:    le lr, .LBB1_2
108; CHECK-NEXT:  @ %bb.3: @ %middle.block
109; CHECK-NEXT:    vpsel q0, q0, q1
110; CHECK-NEXT:    vaddv.u32 r0, q0
111; CHECK-NEXT:    pop {r7, pc}
112entry:
; Skip the loop entirely for N == 0; for.cond.cleanup then yields 0.
113  %cmp6 = icmp eq i32 %N, 0
114  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
115
116vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4; the loop exits once the index
; reaches it.
117  %n.rnd.up = add i32 %N, 3
118  %n.vec = and i32 %n.rnd.up, -4
; %trip.count.minus.1 and its splat are referenced only by the commented-out
; compare in vector.body.
119  %trip.count.minus.1 = add i32 %N, -1
120  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
121  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
122  br label %vector.body
123
124vector.body:                                      ; preds = %vector.body, %vector.ph
; %index counts processed elements; %vec.phi is the running 4-lane accumulator.
125  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
126  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
; %induction is referenced only by the commented-out compare below.
127  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
128  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
129  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
130  %0 = getelementptr inbounds i32, i32* %a, i32 %index
131
; Compare-based form of the mask, left commented out; the call below is what
; defines %1.
132  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
133  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
134
135  %2 = bitcast i32* %0 to <4 x i32>*
136  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
; acc' = a[i..i+3] + acc
137  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
138  %index.next = add i32 %index, 4
139  %4 = icmp eq i32 %index.next, %n.vec
140  br i1 %4, label %middle.block, label %vector.body
141
142middle.block:                                     ; preds = %vector.body
; Per lane of the last mask %1, take the updated accumulator %3 where the lane
; is active and the previous value %vec.phi otherwise, then sum the four lanes.
143  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
144  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
145  br label %for.cond.cleanup
146
147for.cond.cleanup:                                 ; preds = %middle.block, %entry
; 0 when arriving from %entry (N == 0), the reduced sum otherwise.
148  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
149  ret i32 %res.0.lcssa
150}
151
; add_reduce_add_const: returns 0 when %N == 0. Otherwise it walks %a four
; i32 lanes per iteration under the mask from
; @llvm.get.active.lane.mask(%index, %N), adds each masked load to the
; accumulator %vec.phi and returns the horizontal sum. The %b parameter is
; not used by the body.
; The expected assembly keeps an explicit vctp.32 / vpst predicate inside a
; dls/le loop and finishes with vpsel followed by vaddv.u32.
152define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
153; CHECK-LABEL: add_reduce_add_const:
154; CHECK:       @ %bb.0: @ %entry
155; CHECK-NEXT:    cmp r2, #0
156; CHECK-NEXT:    itt eq
157; CHECK-NEXT:    moveq r0, #0
158; CHECK-NEXT:    bxeq lr
159; CHECK-NEXT:  .LBB2_1: @ %vector.ph
160; CHECK-NEXT:    push {r7, lr}
161; CHECK-NEXT:    adds r1, r2, #3
162; CHECK-NEXT:    movs r3, #1
163; CHECK-NEXT:    bic r1, r1, #3
164; CHECK-NEXT:    vmov.i32 q0, #0x0
165; CHECK-NEXT:    subs r1, #4
166; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
167; CHECK-NEXT:    movs r1, #0
168; CHECK-NEXT:    dls lr, lr
169; CHECK-NEXT:  .LBB2_2: @ %vector.body
170; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
171; CHECK-NEXT:    vctp.32 r2
172; CHECK-NEXT:    vmov q1, q0
173; CHECK-NEXT:    vpst
174; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
175; CHECK-NEXT:    adds r1, #4
176; CHECK-NEXT:    subs r2, #4
177; CHECK-NEXT:    vadd.i32 q0, q0, q1
178; CHECK-NEXT:    le lr, .LBB2_2
179; CHECK-NEXT:  @ %bb.3: @ %middle.block
180; CHECK-NEXT:    vpsel q0, q0, q1
181; CHECK-NEXT:    vaddv.u32 r0, q0
182; CHECK-NEXT:    pop {r7, pc}
183entry:
; Skip the loop entirely for N == 0; for.cond.cleanup then yields 0.
184  %cmp6 = icmp eq i32 %N, 0
185  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
186
187vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4; the loop exits once the index
; reaches it.
188  %n.rnd.up = add i32 %N, 3
189  %n.vec = and i32 %n.rnd.up, -4
; %trip.count.minus.1 and its splat are referenced only by the commented-out
; compare in vector.body.
190  %trip.count.minus.1 = add i32 %N, -1
191  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
192  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
193  br label %vector.body
194
195vector.body:                                      ; preds = %vector.body, %vector.ph
; %index counts processed elements; %vec.phi is the running 4-lane accumulator.
196  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
197  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
; %induction is referenced only by the commented-out compare below.
198  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
199  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
200  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
201  %0 = getelementptr inbounds i32, i32* %a, i32 %index
202
; Compare-based form of the mask, left commented out; the call below is what
; defines %1.
203  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
204  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
205
206  %2 = bitcast i32* %0 to <4 x i32>*
207  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
; acc' = a[i..i+3] + acc
208  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
209  %index.next = add i32 %index, 4
210  %4 = icmp eq i32 %index.next, %n.vec
211  br i1 %4, label %middle.block, label %vector.body
212
213middle.block:                                     ; preds = %vector.body
; Per lane of the last mask %1, take the updated accumulator %3 where the lane
; is active and the previous value %vec.phi otherwise, then sum the four lanes.
214  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
215  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
216  br label %for.cond.cleanup
217
218for.cond.cleanup:                                 ; preds = %middle.block, %entry
; 0 when arriving from %entry (N == 0), the reduced sum otherwise.
219  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
220  ret i32 %res.0.lcssa
221}
222
; vector_mul_const: does nothing when %N == 0. Otherwise, four i32 lanes per
; iteration, it loads from %b under the mask from
; @llvm.get.active.lane.mask(%index, %N), multiplies every lane by the scalar
; %c and stores the result to %a under the same mask.
; The expected assembly is a dlstp.32 / letp loop with no explicit vctp or
; vpst, and the multiply takes the scalar straight from r2.
223define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
224; CHECK-LABEL: vector_mul_const:
225; CHECK:       @ %bb.0: @ %entry
226; CHECK-NEXT:    push {r7, lr}
227; CHECK-NEXT:    cmp r3, #0
228; CHECK-NEXT:    it eq
229; CHECK-NEXT:    popeq {r7, pc}
230; CHECK-NEXT:  .LBB3_1: @ %vector.ph
231; CHECK-NEXT:    mov.w r12, #0
232; CHECK-NEXT:    dlstp.32 lr, r3
233; CHECK-NEXT:  .LBB3_2: @ %vector.body
234; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
235; CHECK-NEXT:    add.w r12, r12, #4
236; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
237; CHECK-NEXT:    vmul.i32 q0, q0, r2
238; CHECK-NEXT:    vstrw.32 q0, [r0], #16
239; CHECK-NEXT:    letp lr, .LBB3_2
240; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
241; CHECK-NEXT:    pop {r7, pc}
242entry:
; Skip the loop entirely for N == 0.
243  %cmp6 = icmp eq i32 %N, 0
244  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
245
246vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4; the loop exits once the index
; reaches it.
247  %n.rnd.up = add i32 %N, 3
248  %n.vec = and i32 %n.rnd.up, -4
; %trip.count.minus.1 and %broadcast.splat9 are referenced only by the
; commented-out compare in vector.body.
249  %trip.count.minus.1 = add i32 %N, -1
250  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
251  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
; %broadcast.splat11 holds %c in all four lanes (the multiplier).
252  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
253  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
254  br label %vector.body
255
256vector.body:                                      ; preds = %vector.body, %vector.ph
257  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; %induction is referenced only by the commented-out compare below.
258  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
259  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
260  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
261  %0 = getelementptr inbounds i32, i32* %b, i32 %index
262
; Compare-based form of the mask, left commented out; the call below is what
; defines %1.
263  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
264  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
265
; a[i..i+3] = b[i..i+3] * c, with load and store both governed by %1.
266  %2 = bitcast i32* %0 to <4 x i32>*
267  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
268  %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
269  %4 = getelementptr inbounds i32, i32* %a, i32 %index
270  %5 = bitcast i32* %4 to <4 x i32>*
271  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
272  %index.next = add i32 %index, 4
273  %6 = icmp eq i32 %index.next, %n.vec
274  br i1 %6, label %for.cond.cleanup, label %vector.body
275
276for.cond.cleanup:                                 ; preds = %vector.body, %entry
277  ret void
278}
279
; vector_add_const: does nothing when %N == 0. Otherwise, four i32 lanes per
; iteration, it loads from %b under the mask from
; @llvm.get.active.lane.mask(%index, %N), adds the scalar %c to every lane
; and stores the result to %a under the same mask.
; The expected assembly is a dlstp.32 / letp loop with no explicit vctp or
; vpst, and the add takes the scalar straight from r2.
280define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
281; CHECK-LABEL: vector_add_const:
282; CHECK:       @ %bb.0: @ %entry
283; CHECK-NEXT:    push {r7, lr}
284; CHECK-NEXT:    cmp r3, #0
285; CHECK-NEXT:    it eq
286; CHECK-NEXT:    popeq {r7, pc}
287; CHECK-NEXT:  .LBB4_1: @ %vector.ph
288; CHECK-NEXT:    mov.w r12, #0
289; CHECK-NEXT:    dlstp.32 lr, r3
290; CHECK-NEXT:  .LBB4_2: @ %vector.body
291; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
292; CHECK-NEXT:    add.w r12, r12, #4
293; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
294; CHECK-NEXT:    vadd.i32 q0, q0, r2
295; CHECK-NEXT:    vstrw.32 q0, [r0], #16
296; CHECK-NEXT:    letp lr, .LBB4_2
297; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
298; CHECK-NEXT:    pop {r7, pc}
299entry:
; Skip the loop entirely for N == 0.
300  %cmp6 = icmp eq i32 %N, 0
301  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
302
303vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4; the loop exits once the index
; reaches it.
304  %n.rnd.up = add i32 %N, 3
305  %n.vec = and i32 %n.rnd.up, -4
; %trip.count.minus.1 and %broadcast.splat9 are referenced only by the
; commented-out compare in vector.body.
306  %trip.count.minus.1 = add i32 %N, -1
307  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
308  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
; %broadcast.splat11 holds %c in all four lanes (the addend).
309  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
310  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
311  br label %vector.body
312
313vector.body:                                      ; preds = %vector.body, %vector.ph
314  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; %induction is referenced only by the commented-out compare below.
315  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
316  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
317  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
318  %0 = getelementptr inbounds i32, i32* %b, i32 %index
319
; Compare-based form of the mask, left commented out; the call below is what
; defines %1.
320  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
321  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
322
; a[i..i+3] = b[i..i+3] + c, with load and store both governed by %1.
323  %2 = bitcast i32* %0 to <4 x i32>*
324  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
325  %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
326  %4 = getelementptr inbounds i32, i32* %a, i32 %index
327  %5 = bitcast i32* %4 to <4 x i32>*
328  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
329  %index.next = add i32 %index, 4
330  %6 = icmp eq i32 %index.next, %n.vec
331  br i1 %6, label %for.cond.cleanup, label %vector.body
332
333for.cond.cleanup:                                 ; preds = %vector.body, %entry
334  ret void
335}
336
; vector_mul_vector_i8: does nothing when %N == 0. Otherwise, sixteen i8 lanes
; per iteration, it loads from %b and %c under the mask from
; @llvm.get.active.lane.mask(%index, %N), multiplies them lane-wise and
; stores the product to %a under the same mask.
; The expected assembly is a dlstp.8 / letp loop with no explicit vctp or vpst.
337define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) {
338; CHECK-LABEL: vector_mul_vector_i8:
339; CHECK:       @ %bb.0: @ %entry
340; CHECK-NEXT:    push {r7, lr}
341; CHECK-NEXT:    cmp r3, #0
342; CHECK-NEXT:    it eq
343; CHECK-NEXT:    popeq {r7, pc}
344; CHECK-NEXT:  .LBB5_1: @ %vector.ph
345; CHECK-NEXT:    mov.w r12, #0
346; CHECK-NEXT:    dlstp.8 lr, r3
347; CHECK-NEXT:  .LBB5_2: @ %vector.body
348; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
349; CHECK-NEXT:    add.w r12, r12, #16
350; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
351; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
352; CHECK-NEXT:    vmul.i8 q0, q1, q0
353; CHECK-NEXT:    vstrb.8 q0, [r0], #16
354; CHECK-NEXT:    letp lr, .LBB5_2
355; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
356; CHECK-NEXT:    pop {r7, pc}
357entry:
; Skip the loop entirely for N == 0.
358  %cmp10 = icmp eq i32 %N, 0
359  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
360
361vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 16; the loop exits once the index
; reaches it.
362  %n.rnd.up = add i32 %N, 15
363  %n.vec = and i32 %n.rnd.up, -16
; %trip.count.minus.1 and its splat are referenced only by the commented-out
; compare in vector.body.
364  %trip.count.minus.1 = add i32 %N, -1
365  %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
366  %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
367  br label %vector.body
368
369vector.body:                                      ; preds = %vector.body, %vector.ph
370  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; %induction is referenced only by the commented-out compare below.
371  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
372  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
373  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
374  %0 = getelementptr inbounds i8, i8* %b, i32 %index
375
; Compare-based form of the mask, left commented out; the call below is what
; defines %1.
376  ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
377  %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
378
; a[i..i+15] = c[i..i+15] * b[i..i+15], all memory accesses governed by %1.
379  %2 = bitcast i8* %0 to <16 x i8>*
380  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
381  %3 = getelementptr inbounds i8, i8* %c, i32 %index
382  %4 = bitcast i8* %3 to <16 x i8>*
383  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> undef)
384  %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load
385  %6 = getelementptr inbounds i8, i8* %a, i32 %index
386  %7 = bitcast i8* %6 to <16 x i8>*
387  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %5, <16 x i8>* %7, i32 1, <16 x i1> %1)
388  %index.next = add i32 %index, 16
389  %8 = icmp eq i32 %index.next, %n.vec
390  br i1 %8, label %for.cond.cleanup, label %vector.body
391
392for.cond.cleanup:                                 ; preds = %vector.body, %entry
393  ret void
394}
395
; vector_mul_vector_i16: does nothing when %N == 0. Otherwise, eight i16 lanes
; per iteration, it loads from %b and %c under the mask from
; @llvm.get.active.lane.mask(%index, %N), multiplies them lane-wise and
; stores the product to %a under the same mask.
; The expected assembly is a dlstp.16 / letp loop with no explicit vctp or vpst.
; NOTE(review): the definition references attribute group #0, but no
; "attributes #0" line is visible in this file - confirm it is defined or drop
; the reference.
396; Function Attrs: nofree norecurse nounwind
397define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
398; CHECK-LABEL: vector_mul_vector_i16:
399; CHECK:       @ %bb.0: @ %entry
400; CHECK-NEXT:    push {r7, lr}
401; CHECK-NEXT:    cmp r3, #0
402; CHECK-NEXT:    it eq
403; CHECK-NEXT:    popeq {r7, pc}
404; CHECK-NEXT:  .LBB6_1: @ %vector.ph
405; CHECK-NEXT:    mov.w r12, #0
406; CHECK-NEXT:    dlstp.16 lr, r3
407; CHECK-NEXT:  .LBB6_2: @ %vector.body
408; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
409; CHECK-NEXT:    add.w r12, r12, #8
410; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
411; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
412; CHECK-NEXT:    vmul.i16 q0, q1, q0
413; CHECK-NEXT:    vstrh.16 q0, [r0], #16
414; CHECK-NEXT:    letp lr, .LBB6_2
415; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
416; CHECK-NEXT:    pop {r7, pc}
417entry:
; Skip the loop entirely for N == 0.
418  %cmp10 = icmp eq i32 %N, 0
419  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
420
421vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 8; the loop exits once the index
; reaches it.
422  %n.rnd.up = add i32 %N, 7
423  %n.vec = and i32 %n.rnd.up, -8
; %trip.count.minus.1 and its splat are referenced only by the commented-out
; compare in vector.body.
424  %trip.count.minus.1 = add i32 %N, -1
425  %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
426  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
427  br label %vector.body
428
429vector.body:                                      ; preds = %vector.body, %vector.ph
430  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; %induction is referenced only by the commented-out compare below.
431  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
432  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
433  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
434  %0 = getelementptr inbounds i16, i16* %b, i32 %index
435
; Compare-based form of the mask, left commented out; the call below is what
; defines %1.
436  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
437  %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
438
; a[i..i+7] = c[i..i+7] * b[i..i+7], all memory accesses governed by %1.
439  %2 = bitcast i16* %0 to <8 x i16>*
440  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
441  %3 = getelementptr inbounds i16, i16* %c, i32 %index
442  %4 = bitcast i16* %3 to <8 x i16>*
443  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %1, <8 x i16> undef)
444  %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load
445  %6 = getelementptr inbounds i16, i16* %a, i32 %index
446  %7 = bitcast i16* %6 to <8 x i16>*
447  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %5, <8 x i16>* %7, i32 2, <8 x i1> %1)
448  %index.next = add i32 %index, 8
449  %8 = icmp eq i32 %index.next, %n.vec
450  br i1 %8, label %for.cond.cleanup, label %vector.body
451
452for.cond.cleanup:                                 ; preds = %vector.body, %entry
453  ret void
454}
455
; Declarations of the intrinsics called by the test functions in this file:
; masked loads and masked stores for the <16 x i8>, <8 x i16> and <4 x i32>
; vector types, the <4 x i32> add reduction, and the active-lane-mask
; generators for 4, 8 and 16 lanes.
456declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
457declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
458declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
459declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
460declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
461declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
462declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
463declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
464declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
465declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
466