; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

; The following functions should all fail to become tail-predicated.
; CHECK-NOT: call i32 @llvm.arm.vctp

; trip.count.minus.1 has been inserted into element 1, not 0.
; The shufflevector in vector.ph still broadcasts lane 0 (zeroinitializer
; mask), and lane 0 of the insertelement result is undef.
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Deviation under test: the scalar goes into lane 1 instead of lane 0.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The insert isn't using an undef for operand 0.
; The vector being inserted into is the constant <1, 1, 1, 1>.
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Deviation under test: the base vector is <1, 1, 1, 1> rather than undef.
  %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The shuffle uses a defined value for operand 1.
; The second vector operand of the preheader shufflevector is <1, 1, 1, 1>.
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Deviation under test: the second shuffle input is <1, 1, 1, 1>, not undef.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The shuffle uses a non zero value for operand 2.
; The preheader shuffle mask is <1, 1, 1, 1>, so it broadcasts lane 1 while
; trip.count.minus.1 was inserted into lane 0.
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Deviation under test: the mask is <1, 1, 1, 1> instead of zeroinitializer.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; %N - 2
; The splat in vector.ph is built from %N - 2 instead of %N - 1. The insert
; goes into lane 0 and the shuffle broadcasts lane 0, so the scalar being
; splatted is the only thing that differs from a %N - 1 splat.
define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Deviation under test: -2 rather than -1.
  %trip.count.minus.2 = add i32 %N, -2
  ; Inserted into lane 0 so that the wrong scalar is the only deviation here;
  ; a lane-1 insert would duplicate what @wrong_ph_insert_0 already covers.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; index has been inserted at element 1, not 0.
; The loop-body shufflevector still broadcasts lane 0 (zeroinitializer mask),
; and lane 0 of the insertelement result is undef.
define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of %N - 1 across all four lanes.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Deviation under test: %index goes into lane 1 instead of lane 0.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The value splatted in the loop body is %index + 1, not %index.
; The address computations for %a, %b and %c still use %index itself.
define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of %N - 1 across all four lanes.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Deviation under test: %incorrect (= %index + 1) is splatted, not %index.
  %incorrect = add i32 %index, 1
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Now using ult, not ule for the vector icmp
; The mask %tmp1 is computed with a strict unsigned less-than comparison.
define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of %N - 1 across all four lanes.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Deviation under test: predicate is ult where the other functions use ule.
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The add in the body uses 1, 2, 3, 4
; The per-lane offsets added to splat(%index) are <1, 2, 3, 4> rather than
; <0, 1, 2, 3>.
define void @wrong_body_broadcast_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of %N - 1 across all four lanes.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Deviation under test: the offsets start at 1 instead of 0.
  %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Using a variable for the loop body broadcast.
; The per-lane offsets added to splat(%index) come from the %offsets argument
; instead of a constant vector.
define void @wrong_body_broadcast_splat_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of %N - 1 across all four lanes.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Deviation under test: the addend is the runtime value %offsets.
  %induction = add <4 x i32> %broadcast.splat, %offsets
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; adding 5, instead of 4, to index.
; %index advances by 5 per iteration while every access in the body is a
; <4 x i32> vector.
define void @wrong_index_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  ; Guard: branch straight to the exit when %N == 0.
  %cmp8 = icmp eq i32 %N, 0
  ; Count passed to @llvm.start.loop.iterations in vector.ph:
  ;   %tmp10 = ((%N + 3) >> 2) << 2
  ;   %tmp13 = ((%tmp10 - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of %N - 1 across all four lanes.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 is the mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  ; Deviation under test: step of 5 instead of 4.
  %index.next = add i32 %index, 5
  ; Decrement the loop counter by 1 and keep looping while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsics called by the test functions in this file.
; NOTE(review): attribute groups #1, #2 and #3 are referenced here, but no
; matching `attributes #N = { ... }` definitions are visible in this file --
; confirm that is intentional.
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3