• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; Hexagon code-generation regression test.
; The lit run line below feeds this file to llc with -march=hexagon and pipes
; the result to FileCheck; the negative pattern that follows it requires that
; no "setbit(rN,#1)" instruction (N = any decimal register number) is emitted.
; NOTE(review): the leading digits on each line look like line numbers fused in
; by a source-browser export; they are not valid IR and presumably need to be
; stripped before this file can be run - confirm against the upstream test.
1; RUN: llc -march=hexagon < %s | FileCheck %s
2; CHECK-NOT: setbit(r{{[0-9]+}},#1)
3
4target triple = "hexagon-unknown--elf"
5
; %s.8  - record of one i8*, four i32 values and three %s.9 pointers; it is the
;         element type of the @g2 table below.
; %s.9  - wrapper around a single %s.10.
; %s.10 - wrapper around a single i64.
; %s.4  - record passed by pointer as both arguments of @f6: i64, i8*, three
;         [4 x i32] arrays, an i32, two i8 values and a [6 x i8] tail.
;         @f6 loads the i8* (field 1) and entries of the arrays in
;         fields 2, 3 and 4.
6%s.8 = type { i8*, i32, i32, i32, i32, %s.9*, %s.9*, %s.9* }
7%s.9 = type { %s.10 }
8%s.10 = type { i64 }
9%s.4 = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8, i8, [6 x i8] }
10
; Private constant data, all aligned to 32 bytes:
;   @g0 / @g1 - NUL-terminated strings "input" and "gaussian11".
;   @g2       - two-element %s.8 table whose first fields point at @g0 and @g1;
;               the three trailing %s.9 pointers of each entry are null.
;   @g3       - NUL-terminated string that looks like a target/feature
;               description (presumably emitted by the front end that
;               generated this IR - verify).
11@g0 = private constant [6 x i8] c"input\00", align 32
12@g1 = private constant [11 x i8] c"gaussian11\00", align 32
13@g2 = private constant [2 x %s.8] [%s.8 { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @g0, i32 0, i32 0), i32 1, i32 2, i32 1, i32 8, %s.9* null, %s.9* null, %s.9* null }, %s.8 { i8* getelementptr inbounds ([11 x i8], [11 x i8]* @g1, i32 0, i32 0), i32 2, i32 2, i32 1, i32 8, %s.9* null, %s.9* null, %s.9* null }]
14@g3 = private constant [53 x i8] c"hexagon-32-os_unknown-no_asserts-no_bounds_query-hvx\00", align 32
15
; External declarations; all use attribute group #0, whose definition is not
; in the visible part of the file (the generated comments label it nounwind).
; In the visible part of @f6 only @f2 and @f3 are called; @f0, @f1, @f4 and
; @f5 are declared but not referenced there.
16; Function Attrs: nounwind
17declare i8* @f0(i8*, i32) #0
18
19; Function Attrs: nounwind
20declare void @f1(i8*, i8*) #0
21
; @f2 is called by @f6 as @f2(null, <i32 size>) and returns a noalias i8* that
; @f6 then uses as a scratch buffer (presumably an allocator - verify).
22; Function Attrs: nounwind
23declare noalias i8* @f2(i8*, i32) #0
24
; @f3 is called by @f6 as @f3(null, <pointer returned by @f2>) when that
; pointer is non-null (presumably the matching release routine - verify).
25; Function Attrs: nounwind
26declare void @f3(i8*, i8*) #0
27
28; Function Attrs: nounwind
29declare void @f4() #0
30
31; Function Attrs: nounwind
32declare void @f5() #0
33
34; Function Attrs: nounwind
35define i32 @f6(%s.4* noalias nocapture readonly %a0, %s.4* noalias nocapture readonly %a1) #0 {
36b0:
37  %v0 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 1
38  %v1 = load i8*, i8** %v0
39  %v2 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 2, i32 0
40  %v3 = load i32, i32* %v2
41  %v4 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 2, i32 1
42  %v5 = load i32, i32* %v4
43  %v6 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 3, i32 1
44  %v7 = load i32, i32* %v6
45  %v8 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 4, i32 0
46  %v9 = load i32, i32* %v8
47  %v10 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 4, i32 1
48  %v11 = load i32, i32* %v10
49  %v12 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 1
50  %v13 = load i8*, i8** %v12
51  %v14 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 2, i32 0
52  %v15 = load i32, i32* %v14
53  %v16 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 2, i32 1
54  %v17 = load i32, i32* %v16
55  %v18 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 3, i32 1
56  %v19 = load i32, i32* %v18
57  %v20 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 4, i32 0
58  %v21 = load i32, i32* %v20
59  %v22 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 4, i32 1
60  %v23 = load i32, i32* %v22
61  %v24 = add nsw i32 %v21, %v15
62  %v25 = add nsw i32 %v24, -64
63  %v26 = icmp slt i32 %v21, %v25
64  %v27 = select i1 %v26, i32 %v21, i32 %v25
65  %v28 = add nsw i32 %v15, -1
66  %v29 = and i32 %v28, -64
67  %v30 = add i32 %v21, 63
68  %v31 = add i32 %v30, %v29
69  %v32 = add nsw i32 %v24, -1
70  %v33 = icmp slt i32 %v31, %v32
71  %v34 = select i1 %v33, i32 %v31, i32 %v32
72  %v35 = sub nsw i32 %v34, %v27
73  %v36 = icmp slt i32 %v24, %v34
74  %v37 = select i1 %v36, i32 %v34, i32 %v24
75  %v38 = add nsw i32 %v37, -1
76  %v39 = icmp slt i32 %v38, %v34
77  %v40 = select i1 %v39, i32 %v34, i32 %v38
78  %v41 = add nsw i32 %v17, 1
79  %v42 = sext i32 %v41 to i64
80  %v43 = sub nsw i32 %v40, %v27
81  %v44 = add nsw i32 %v43, 2
82  %v45 = sext i32 %v44 to i64
83  %v46 = mul nsw i64 %v45, %v42
84  %v47 = trunc i64 %v46 to i32
85  %v48 = tail call i8* @f2(i8* null, i32 %v47)
86  %v49 = add nsw i32 %v23, -1
87  %v50 = add i32 %v23, %v17
88  %v51 = icmp sgt i32 %v23, %v50
89  br i1 %v51, label %b12, label %b1, !prof !3
90
91b1:                                               ; preds = %b11, %b0
92  %v52 = phi i32 [ %v220, %b11 ], [ %v49, %b0 ]
93  %v53 = icmp slt i32 %v9, %v24
94  %v54 = select i1 %v53, i32 %v9, i32 %v24
95  %v55 = add nsw i32 %v21, -1
96  %v56 = icmp slt i32 %v54, %v55
97  %v57 = select i1 %v56, i32 %v55, i32 %v54
98  %v58 = add nsw i32 %v9, %v3
99  %v59 = icmp slt i32 %v58, %v24
100  %v60 = select i1 %v59, i32 %v58, i32 %v24
101  %v61 = icmp slt i32 %v60, %v57
102  %v62 = select i1 %v61, i32 %v57, i32 %v60
103  %v63 = icmp slt i32 %v57, %v21
104  br i1 %v63, label %b7, label %b2, !prof !3
105
106b2:                                               ; preds = %b1
107  %v64 = add nsw i32 %v11, %v5
108  %v65 = add nsw i32 %v64, -1
109  %v66 = icmp slt i32 %v52, %v65
110  br i1 %v66, label %b3, label %b4
111
112b3:                                               ; preds = %b3, %b2
113  %v67 = phi i32 [ %v96, %b3 ], [ %v55, %b2 ]
114  %v68 = mul nsw i32 %v11, %v7
115  %v69 = icmp slt i32 %v52, %v11
116  %v70 = select i1 %v69, i32 %v11, i32 %v52
117  %v71 = mul nsw i32 %v70, %v7
118  %v72 = add nsw i32 %v58, -1
119  %v73 = icmp slt i32 %v67, %v72
120  %v74 = select i1 %v73, i32 %v67, i32 %v72
121  %v75 = icmp slt i32 %v74, %v9
122  %v76 = select i1 %v75, i32 %v9, i32 %v74
123  %v77 = add i32 %v68, %v9
124  %v78 = sub i32 %v71, %v77
125  %v79 = add i32 %v78, %v76
126  %v80 = getelementptr inbounds i8, i8* %v1, i32 %v79
127  %v81 = load i8, i8* %v80, align 1, !tbaa !4
128  %v82 = icmp sle i32 %v64, %v52
129  %v83 = icmp sle i32 %v58, %v67
130  %v84 = icmp slt i32 %v67, %v9
131  %v85 = or i1 %v84, %v83
132  %v86 = or i1 %v69, %v85
133  %v87 = or i1 %v82, %v86
134  %v88 = select i1 %v87, i8 0, i8 %v81
135  %v89 = sub i32 1, %v23
136  %v90 = add i32 %v89, %v52
137  %v91 = mul nsw i32 %v90, %v44
138  %v92 = sub i32 1, %v27
139  %v93 = add i32 %v92, %v91
140  %v94 = add i32 %v93, %v67
141  %v95 = getelementptr inbounds i8, i8* %v48, i32 %v94
142  store i8 %v88, i8* %v95, align 1, !tbaa !7
143  %v96 = add nsw i32 %v67, 1
144  %v97 = icmp eq i32 %v96, %v57
145  br i1 %v97, label %b7, label %b3
146
147b4:                                               ; preds = %b2
148  %v98 = icmp slt i32 %v5, 1
149  br i1 %v98, label %b5, label %b6
150
151b5:                                               ; preds = %b5, %b4
152  %v99 = phi i32 [ %v123, %b5 ], [ %v55, %b4 ]
153  %v100 = add nsw i32 %v58, -1
154  %v101 = icmp slt i32 %v99, %v100
155  %v102 = select i1 %v101, i32 %v99, i32 %v100
156  %v103 = icmp slt i32 %v102, %v9
157  %v104 = select i1 %v103, i32 %v9, i32 %v102
158  %v105 = sub i32 %v104, %v9
159  %v106 = getelementptr inbounds i8, i8* %v1, i32 %v105
160  %v107 = load i8, i8* %v106, align 1, !tbaa !4
161  %v108 = icmp sle i32 %v64, %v52
162  %v109 = icmp slt i32 %v52, %v11
163  %v110 = icmp sle i32 %v58, %v99
164  %v111 = icmp slt i32 %v99, %v9
165  %v112 = or i1 %v111, %v110
166  %v113 = or i1 %v109, %v112
167  %v114 = or i1 %v108, %v113
168  %v115 = select i1 %v114, i8 0, i8 %v107
169  %v116 = sub i32 1, %v23
170  %v117 = add i32 %v116, %v52
171  %v118 = mul nsw i32 %v117, %v44
172  %v119 = sub i32 1, %v27
173  %v120 = add i32 %v119, %v118
174  %v121 = add i32 %v120, %v99
175  %v122 = getelementptr inbounds i8, i8* %v48, i32 %v121
176  store i8 %v115, i8* %v122, align 1, !tbaa !7
177  %v123 = add nsw i32 %v99, 1
178  %v124 = icmp eq i32 %v123, %v57
179  br i1 %v124, label %b7, label %b5
180
181b6:                                               ; preds = %b6, %b4
182  %v125 = phi i32 [ %v153, %b6 ], [ %v55, %b4 ]
183  %v126 = mul nsw i32 %v11, %v7
184  %v127 = mul nsw i32 %v65, %v7
185  %v128 = add nsw i32 %v58, -1
186  %v129 = icmp slt i32 %v125, %v128
187  %v130 = select i1 %v129, i32 %v125, i32 %v128
188  %v131 = icmp slt i32 %v130, %v9
189  %v132 = select i1 %v131, i32 %v9, i32 %v130
190  %v133 = add i32 %v126, %v9
191  %v134 = sub i32 %v127, %v133
192  %v135 = add i32 %v134, %v132
193  %v136 = getelementptr inbounds i8, i8* %v1, i32 %v135
194  %v137 = load i8, i8* %v136, align 1, !tbaa !4
195  %v138 = icmp sle i32 %v64, %v52
196  %v139 = icmp slt i32 %v52, %v11
197  %v140 = icmp sle i32 %v58, %v125
198  %v141 = icmp slt i32 %v125, %v9
199  %v142 = or i1 %v141, %v140
200  %v143 = or i1 %v139, %v142
201  %v144 = or i1 %v138, %v143
202  %v145 = select i1 %v144, i8 0, i8 %v137
203  %v146 = sub i32 1, %v23
204  %v147 = add i32 %v146, %v52
205  %v148 = mul nsw i32 %v147, %v44
206  %v149 = sub i32 1, %v27
207  %v150 = add i32 %v149, %v148
208  %v151 = add i32 %v150, %v125
209  %v152 = getelementptr inbounds i8, i8* %v48, i32 %v151
210  store i8 %v145, i8* %v152, align 1, !tbaa !7
211  %v153 = add nsw i32 %v125, 1
212  %v154 = icmp eq i32 %v153, %v57
213  br i1 %v154, label %b7, label %b6
214
215b7:                                               ; preds = %b6, %b5, %b3, %b1
216  %v155 = icmp slt i32 %v57, %v62
217  br i1 %v155, label %b8, label %b9, !prof !9
218
219b8:                                               ; preds = %b8, %b7
220  %v156 = phi i32 [ %v181, %b8 ], [ %v57, %b7 ]
221  %v157 = mul nsw i32 %v11, %v7
222  %v158 = add nsw i32 %v11, %v5
223  %v159 = add nsw i32 %v158, -1
224  %v160 = icmp slt i32 %v52, %v159
225  %v161 = select i1 %v160, i32 %v52, i32 %v159
226  %v162 = icmp slt i32 %v161, %v11
227  %v163 = select i1 %v162, i32 %v11, i32 %v161
228  %v164 = mul nsw i32 %v163, %v7
229  %v165 = add i32 %v157, %v9
230  %v166 = sub i32 %v164, %v165
231  %v167 = add i32 %v166, %v156
232  %v168 = getelementptr inbounds i8, i8* %v1, i32 %v167
233  %v169 = load i8, i8* %v168, align 1, !tbaa !4
234  %v170 = icmp sle i32 %v158, %v52
235  %v171 = icmp slt i32 %v52, %v11
236  %v172 = or i1 %v171, %v170
237  %v173 = select i1 %v172, i8 0, i8 %v169
238  %v174 = sub i32 1, %v23
239  %v175 = add i32 %v174, %v52
240  %v176 = mul nsw i32 %v175, %v44
241  %v177 = sub i32 1, %v27
242  %v178 = add i32 %v177, %v176
243  %v179 = add i32 %v178, %v156
244  %v180 = getelementptr inbounds i8, i8* %v48, i32 %v179
245  store i8 %v173, i8* %v180, align 1, !tbaa !7
246  %v181 = add nsw i32 %v156, 1
247  %v182 = icmp eq i32 %v181, %v62
248  br i1 %v182, label %b9, label %b8
249
250b9:                                               ; preds = %b8, %b7
251  %v183 = icmp slt i32 %v62, %v24
252  br i1 %v183, label %b10, label %b11, !prof !9
253
254b10:                                              ; preds = %b10, %b9
255  %v184 = phi i32 [ %v218, %b10 ], [ %v62, %b9 ]
256  %v185 = mul nsw i32 %v11, %v7
257  %v186 = add nsw i32 %v11, %v5
258  %v187 = add nsw i32 %v186, -1
259  %v188 = icmp slt i32 %v52, %v187
260  %v189 = select i1 %v188, i32 %v52, i32 %v187
261  %v190 = icmp slt i32 %v189, %v11
262  %v191 = select i1 %v190, i32 %v11, i32 %v189
263  %v192 = mul nsw i32 %v191, %v7
264  %v193 = add nsw i32 %v58, -1
265  %v194 = icmp slt i32 %v184, %v193
266  %v195 = select i1 %v194, i32 %v184, i32 %v193
267  %v196 = icmp slt i32 %v195, %v9
268  %v197 = select i1 %v196, i32 %v9, i32 %v195
269  %v198 = add i32 %v185, %v9
270  %v199 = sub i32 %v192, %v198
271  %v200 = add i32 %v199, %v197
272  %v201 = getelementptr inbounds i8, i8* %v1, i32 %v200
273  %v202 = load i8, i8* %v201, align 1, !tbaa !4
274  %v203 = icmp sle i32 %v186, %v52
275  %v204 = icmp slt i32 %v52, %v11
276  %v205 = icmp sle i32 %v58, %v184
277  %v206 = icmp slt i32 %v184, %v9
278  %v207 = or i1 %v206, %v205
279  %v208 = or i1 %v204, %v207
280  %v209 = or i1 %v203, %v208
281  %v210 = select i1 %v209, i8 0, i8 %v202
282  %v211 = sub i32 1, %v23
283  %v212 = add i32 %v211, %v52
284  %v213 = mul nsw i32 %v212, %v44
285  %v214 = sub i32 1, %v27
286  %v215 = add i32 %v214, %v213
287  %v216 = add i32 %v215, %v184
288  %v217 = getelementptr inbounds i8, i8* %v48, i32 %v216
289  store i8 %v210, i8* %v217, align 1, !tbaa !7
290  %v218 = add nsw i32 %v184, 1
291  %v219 = icmp eq i32 %v218, %v24
292  br i1 %v219, label %b11, label %b10
293
294b11:                                              ; preds = %b10, %b9
295  %v220 = add nsw i32 %v52, 1
296  %v221 = icmp eq i32 %v220, %v50
297  br i1 %v221, label %b12, label %b1
298
299b12:                                              ; preds = %b11, %b0
300  %v222 = add nsw i32 %v35, 1
301  %v223 = sext i32 %v222 to i64
302  %v224 = shl nsw i64 %v42, 2
303  %v225 = mul i64 %v224, %v223
304  %v226 = trunc i64 %v225 to i32
305  %v227 = tail call i8* @f2(i8* null, i32 %v226)
306  br i1 %v51, label %b14, label %b13, !prof !3
307
308b13:                                              ; preds = %b19, %b12
309  %v228 = phi i32 [ %v351, %b19 ], [ %v49, %b12 ]
310  %v229 = ashr i32 %v15, 6
311  %v230 = icmp slt i32 %v229, 0
312  %v231 = select i1 %v230, i32 0, i32 %v229
313  %v232 = icmp sgt i32 %v231, 0
314  br i1 %v232, label %b16, label %b17, !prof !9
315
316b14:                                              ; preds = %b19, %b12
317  %v233 = icmp eq i8* %v48, null
318  br i1 %v233, label %b20, label %b15
319
320b15:                                              ; preds = %b14
321  tail call void @f3(i8* null, i8* %v48) #2
322  br label %b20
323
324b16:                                              ; preds = %b16, %b13
325  %v234 = phi i32 [ %v289, %b16 ], [ 0, %b13 ]
326  %v235 = sub nsw i32 %v228, %v23
327  %v236 = add nsw i32 %v235, 1
328  %v237 = mul nsw i32 %v236, %v44
329  %v238 = shl i32 %v234, 6
330  %v239 = sub i32 %v21, %v27
331  %v240 = add i32 %v239, %v238
332  %v241 = add nsw i32 %v240, %v237
333  %v242 = getelementptr inbounds i8, i8* %v48, i32 %v241
334  %v243 = bitcast i8* %v242 to <16 x i32>*
335  %v244 = load <16 x i32>, <16 x i32>* %v243, align 1, !tbaa !7
336  %v245 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %v244)
337  %v246 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v245)
338  %v247 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v245)
339  %v248 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v247)
340  %v249 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v246)
341  %v250 = add nsw i32 %v241, 1
342  %v251 = getelementptr inbounds i8, i8* %v48, i32 %v250
343  %v252 = bitcast i8* %v251 to <16 x i32>*
344  %v253 = load <16 x i32>, <16 x i32>* %v252, align 1, !tbaa !7
345  %v254 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %v253)
346  %v255 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v254)
347  %v256 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v255)
348  %v257 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v256)
349  %v258 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v256)
350  %v259 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v257, i32 168430090)
351  %v260 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v258, i32 168430090)
352  %v261 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v259, <16 x i32> %v260)
353  %v262 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v254)
354  %v263 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v262)
355  %v264 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v263)
356  %v265 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v263)
357  %v266 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v264, i32 168430090)
358  %v267 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v265, i32 168430090)
359  %v268 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v266, <16 x i32> %v267)
360  %v269 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v248, <32 x i32> %v261)
361  %v270 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v249, <32 x i32> %v268)
362  %v271 = shufflevector <32 x i32> %v269, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
363  %v272 = mul nsw i32 %v236, %v222
364  %v273 = add nsw i32 %v240, %v272
365  %v274 = bitcast i8* %v227 to i32*
366  %v275 = getelementptr inbounds i32, i32* %v274, i32 %v273
367  %v276 = bitcast i32* %v275 to <16 x i32>*
368  store <16 x i32> %v271, <16 x i32>* %v276, align 4, !tbaa !10
369  %v277 = shufflevector <32 x i32> %v269, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
370  %v278 = add nsw i32 %v273, 16
371  %v279 = getelementptr inbounds i32, i32* %v274, i32 %v278
372  %v280 = bitcast i32* %v279 to <16 x i32>*
373  store <16 x i32> %v277, <16 x i32>* %v280, align 4, !tbaa !10
374  %v281 = shufflevector <32 x i32> %v270, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
375  %v282 = add nsw i32 %v273, 32
376  %v283 = getelementptr inbounds i32, i32* %v274, i32 %v282
377  %v284 = bitcast i32* %v283 to <16 x i32>*
378  store <16 x i32> %v281, <16 x i32>* %v284, align 4, !tbaa !10
379  %v285 = shufflevector <32 x i32> %v270, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
380  %v286 = add nsw i32 %v273, 48
381  %v287 = getelementptr inbounds i32, i32* %v274, i32 %v286
382  %v288 = bitcast i32* %v287 to <16 x i32>*
383  store <16 x i32> %v285, <16 x i32>* %v288, align 4, !tbaa !10
384  %v289 = add nuw nsw i32 %v234, 1
385  %v290 = icmp eq i32 %v289, %v231
386  br i1 %v290, label %b17, label %b16
387
388b17:                                              ; preds = %b16, %b13
389  %v291 = add nsw i32 %v15, 63
390  %v292 = ashr i32 %v291, 6
391  %v293 = icmp slt i32 %v231, %v292
392  br i1 %v293, label %b18, label %b19, !prof !9
393
394b18:                                              ; preds = %b18, %b17
395  %v294 = phi i32 [ %v349, %b18 ], [ %v231, %b17 ]
396  %v295 = sub nsw i32 %v228, %v23
397  %v296 = add nsw i32 %v295, 1
398  %v297 = mul nsw i32 %v296, %v44
399  %v298 = sub nsw i32 %v24, %v27
400  %v299 = add nsw i32 %v297, %v298
401  %v300 = add nsw i32 %v299, -64
402  %v301 = getelementptr inbounds i8, i8* %v48, i32 %v300
403  %v302 = bitcast i8* %v301 to <16 x i32>*
404  %v303 = load <16 x i32>, <16 x i32>* %v302, align 1, !tbaa !7
405  %v304 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %v303)
406  %v305 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v304)
407  %v306 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v304)
408  %v307 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v306)
409  %v308 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v305)
410  %v309 = add nsw i32 %v299, -63
411  %v310 = getelementptr inbounds i8, i8* %v48, i32 %v309
412  %v311 = bitcast i8* %v310 to <16 x i32>*
413  %v312 = load <16 x i32>, <16 x i32>* %v311, align 1, !tbaa !7
414  %v313 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %v312)
415  %v314 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v313)
416  %v315 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v314)
417  %v316 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v315)
418  %v317 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v315)
419  %v318 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v316, i32 168430090)
420  %v319 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v317, i32 168430090)
421  %v320 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v318, <16 x i32> %v319)
422  %v321 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v313)
423  %v322 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v321)
424  %v323 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v322)
425  %v324 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v322)
426  %v325 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v323, i32 168430090)
427  %v326 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v324, i32 168430090)
428  %v327 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v325, <16 x i32> %v326)
429  %v328 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v307, <32 x i32> %v320)
430  %v329 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v308, <32 x i32> %v327)
431  %v330 = shufflevector <32 x i32> %v328, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
432  %v331 = mul nsw i32 %v296, %v222
433  %v332 = add nsw i32 %v331, %v298
434  %v333 = add nsw i32 %v332, -64
435  %v334 = bitcast i8* %v227 to i32*
436  %v335 = getelementptr inbounds i32, i32* %v334, i32 %v333
437  %v336 = bitcast i32* %v335 to <16 x i32>*
438  store <16 x i32> %v330, <16 x i32>* %v336, align 4, !tbaa !10
439  %v337 = shufflevector <32 x i32> %v328, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
440  %v338 = add nsw i32 %v332, -48
441  %v339 = getelementptr inbounds i32, i32* %v334, i32 %v338
442  %v340 = bitcast i32* %v339 to <16 x i32>*
443  store <16 x i32> %v337, <16 x i32>* %v340, align 4, !tbaa !10
444  %v341 = shufflevector <32 x i32> %v329, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
445  %v342 = add nsw i32 %v332, -32
446  %v343 = getelementptr inbounds i32, i32* %v334, i32 %v342
447  %v344 = bitcast i32* %v343 to <16 x i32>*
448  store <16 x i32> %v341, <16 x i32>* %v344, align 4, !tbaa !10
449  %v345 = shufflevector <32 x i32> %v329, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
450  %v346 = add nsw i32 %v332, -16
451  %v347 = getelementptr inbounds i32, i32* %v334, i32 %v346
452  %v348 = bitcast i32* %v347 to <16 x i32>*
453  store <16 x i32> %v345, <16 x i32>* %v348, align 4, !tbaa !10
454  %v349 = add nuw nsw i32 %v294, 1
455  %v350 = icmp eq i32 %v349, %v292
456  br i1 %v350, label %b19, label %b18
457
458b19:                                              ; preds = %b18, %b17
459  %v351 = add nsw i32 %v228, 1
460  %v352 = icmp eq i32 %v351, %v50
461  br i1 %v352, label %b14, label %b13
462
463b20:                                              ; preds = %b15, %b14
464  %v353 = icmp sgt i32 %v17, 0
465  br i1 %v353, label %b21, label %b31, !prof !9
466
467b21:                                              ; preds = %b20
468  %v354 = ashr i32 %v15, 6
469  %v355 = icmp slt i32 %v354, 0
470  %v356 = select i1 %v355, i32 0, i32 %v354
471  %v357 = icmp sgt i32 %v356, 0
472  br i1 %v357, label %b25, label %b27
473
474b22:                                              ; preds = %b25, %b22
475  %v358 = phi i32 [ %v442, %b22 ], [ 0, %b25 ]
476  %v359 = sub nsw i32 %v525, %v23
477  %v360 = mul nsw i32 %v359, %v222
478  %v361 = shl nsw i32 %v358, 6
479  %v362 = add nsw i32 %v361, %v21
480  %v363 = sub nsw i32 %v362, %v27
481  %v364 = add nsw i32 %v363, %v360
482  %v365 = bitcast i8* %v227 to i32*
483  %v366 = getelementptr inbounds i32, i32* %v365, i32 %v364
484  %v367 = bitcast i32* %v366 to <16 x i32>*
485  %v368 = load <16 x i32>, <16 x i32>* %v367, align 4, !tbaa !10
486  %v369 = add nsw i32 %v364, 16
487  %v370 = getelementptr inbounds i32, i32* %v365, i32 %v369
488  %v371 = bitcast i32* %v370 to <16 x i32>*
489  %v372 = load <16 x i32>, <16 x i32>* %v371, align 4, !tbaa !10
490  %v373 = shufflevector <16 x i32> %v368, <16 x i32> %v372, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
491  %v374 = add nsw i32 %v359, 1
492  %v375 = mul nsw i32 %v374, %v222
493  %v376 = add nsw i32 %v363, %v375
494  %v377 = getelementptr inbounds i32, i32* %v365, i32 %v376
495  %v378 = bitcast i32* %v377 to <16 x i32>*
496  %v379 = load <16 x i32>, <16 x i32>* %v378, align 4, !tbaa !10
497  %v380 = add nsw i32 %v376, 16
498  %v381 = getelementptr inbounds i32, i32* %v365, i32 %v380
499  %v382 = bitcast i32* %v381 to <16 x i32>*
500  %v383 = load <16 x i32>, <16 x i32>* %v382, align 4, !tbaa !10
501  %v384 = shufflevector <16 x i32> %v379, <16 x i32> %v383, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
502  %v385 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v384)
503  %v386 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v384)
504  %v387 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v385, i32 168430090)
505  %v388 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v386, i32 168430090)
506  %v389 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v387, <16 x i32> %v388)
507  %v390 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v373, <32 x i32> %v389)
508  %v391 = shufflevector <32 x i32> %v390, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
509  %v392 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v391, i32 20)
510  %v393 = shufflevector <32 x i32> %v390, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
511  %v394 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v393, i32 20)
512  %v395 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v394, <16 x i32> %v392)
513  %v396 = add nsw i32 %v364, 32
514  %v397 = getelementptr inbounds i32, i32* %v365, i32 %v396
515  %v398 = bitcast i32* %v397 to <16 x i32>*
516  %v399 = load <16 x i32>, <16 x i32>* %v398, align 4, !tbaa !10
517  %v400 = add nsw i32 %v364, 48
518  %v401 = getelementptr inbounds i32, i32* %v365, i32 %v400
519  %v402 = bitcast i32* %v401 to <16 x i32>*
520  %v403 = load <16 x i32>, <16 x i32>* %v402, align 4, !tbaa !10
521  %v404 = shufflevector <16 x i32> %v399, <16 x i32> %v403, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
522  %v405 = add nsw i32 %v376, 32
523  %v406 = getelementptr inbounds i32, i32* %v365, i32 %v405
524  %v407 = bitcast i32* %v406 to <16 x i32>*
525  %v408 = load <16 x i32>, <16 x i32>* %v407, align 4, !tbaa !10
526  %v409 = add nsw i32 %v376, 48
527  %v410 = getelementptr inbounds i32, i32* %v365, i32 %v409
528  %v411 = bitcast i32* %v410 to <16 x i32>*
529  %v412 = load <16 x i32>, <16 x i32>* %v411, align 4, !tbaa !10
530  %v413 = shufflevector <16 x i32> %v408, <16 x i32> %v412, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
531  %v414 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v413)
532  %v415 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v413)
533  %v416 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v414, i32 168430090)
534  %v417 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v415, i32 168430090)
535  %v418 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v416, <16 x i32> %v417)
536  %v419 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v404, <32 x i32> %v418)
537  %v420 = shufflevector <32 x i32> %v419, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
538  %v421 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v420, i32 20)
539  %v422 = shufflevector <32 x i32> %v419, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
540  %v423 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v422, i32 20)
541  %v424 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v423, <16 x i32> %v421)
542  %v425 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v395)
543  %v426 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v395)
544  %v427 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v425, <16 x i32> %v426)
545  %v428 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v424)
546  %v429 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v424)
547  %v430 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v428, <16 x i32> %v429)
548  %v431 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v430, <16 x i32> %v427)
549  %v432 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v431)
550  %v433 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v431)
551  %v434 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %v432, <16 x i32> %v433)
552  %v435 = mul nsw i32 %v23, %v19
553  %v436 = mul nsw i32 %v525, %v19
554  %v437 = add i32 %v435, %v21
555  %v438 = sub i32 %v436, %v437
556  %v439 = add i32 %v438, %v362
557  %v440 = getelementptr inbounds i8, i8* %v13, i32 %v439
558  %v441 = bitcast i8* %v440 to <16 x i32>*
559  store <16 x i32> %v434, <16 x i32>* %v441, align 1, !tbaa !12
560  %v442 = add nuw nsw i32 %v358, 1
561  %v443 = icmp eq i32 %v442, %v356
562  br i1 %v443, label %b26, label %b22
563
564b23:                                              ; preds = %b26, %b23
565  %v444 = phi i32 [ %v521, %b23 ], [ %v356, %b26 ]
566  %v445 = sub nsw i32 %v24, %v27
567  %v446 = add nsw i32 %v360, %v445
568  %v447 = add nsw i32 %v446, -64
569  %v448 = getelementptr inbounds i32, i32* %v365, i32 %v447
570  %v449 = bitcast i32* %v448 to <16 x i32>*
571  %v450 = load <16 x i32>, <16 x i32>* %v449, align 4, !tbaa !10
572  %v451 = add nsw i32 %v446, -48
573  %v452 = getelementptr inbounds i32, i32* %v365, i32 %v451
574  %v453 = bitcast i32* %v452 to <16 x i32>*
575  %v454 = load <16 x i32>, <16 x i32>* %v453, align 4, !tbaa !10
576  %v455 = shufflevector <16 x i32> %v450, <16 x i32> %v454, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
577  %v456 = add nsw i32 %v375, %v445
578  %v457 = add nsw i32 %v456, -64
579  %v458 = getelementptr inbounds i32, i32* %v365, i32 %v457
580  %v459 = bitcast i32* %v458 to <16 x i32>*
581  %v460 = load <16 x i32>, <16 x i32>* %v459, align 4, !tbaa !10
582  %v461 = add nsw i32 %v456, -48
583  %v462 = getelementptr inbounds i32, i32* %v365, i32 %v461
584  %v463 = bitcast i32* %v462 to <16 x i32>*
585  %v464 = load <16 x i32>, <16 x i32>* %v463, align 4, !tbaa !10
586  %v465 = shufflevector <16 x i32> %v460, <16 x i32> %v464, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
587  %v466 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v465)
588  %v467 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v465)
589  %v468 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v466, i32 168430090)
590  %v469 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v467, i32 168430090)
591  %v470 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v468, <16 x i32> %v469)
592  %v471 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v455, <32 x i32> %v470)
593  %v472 = shufflevector <32 x i32> %v471, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
594  %v473 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v472, i32 20)
595  %v474 = shufflevector <32 x i32> %v471, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
596  %v475 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v474, i32 20)
597  %v476 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v475, <16 x i32> %v473)
598  %v477 = add nsw i32 %v446, -32
599  %v478 = getelementptr inbounds i32, i32* %v365, i32 %v477
600  %v479 = bitcast i32* %v478 to <16 x i32>*
601  %v480 = load <16 x i32>, <16 x i32>* %v479, align 4, !tbaa !10
602  %v481 = add nsw i32 %v446, -16
603  %v482 = getelementptr inbounds i32, i32* %v365, i32 %v481
604  %v483 = bitcast i32* %v482 to <16 x i32>*
605  %v484 = load <16 x i32>, <16 x i32>* %v483, align 4, !tbaa !10
606  %v485 = shufflevector <16 x i32> %v480, <16 x i32> %v484, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
607  %v486 = add nsw i32 %v456, -32
608  %v487 = getelementptr inbounds i32, i32* %v365, i32 %v486
609  %v488 = bitcast i32* %v487 to <16 x i32>*
610  %v489 = load <16 x i32>, <16 x i32>* %v488, align 4, !tbaa !10
611  %v490 = add nsw i32 %v456, -16
612  %v491 = getelementptr inbounds i32, i32* %v365, i32 %v490
613  %v492 = bitcast i32* %v491 to <16 x i32>*
614  %v493 = load <16 x i32>, <16 x i32>* %v492, align 4, !tbaa !10
615  %v494 = shufflevector <16 x i32> %v489, <16 x i32> %v493, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
616  %v495 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v494)
617  %v496 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v494)
618  %v497 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v495, i32 168430090)
619  %v498 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v496, i32 168430090)
620  %v499 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v497, <16 x i32> %v498)
621  %v500 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v485, <32 x i32> %v499)
622  %v501 = shufflevector <32 x i32> %v500, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
623  %v502 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v501, i32 20)
624  %v503 = shufflevector <32 x i32> %v500, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
625  %v504 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v503, i32 20)
626  %v505 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v504, <16 x i32> %v502)
627  %v506 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v476)
628  %v507 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v476)
629  %v508 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v506, <16 x i32> %v507)
630  %v509 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v505)
631  %v510 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v505)
632  %v511 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v509, <16 x i32> %v510)
633  %v512 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v511, <16 x i32> %v508)
634  %v513 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v512)
635  %v514 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v512)
636  %v515 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %v513, <16 x i32> %v514)
637  %v516 = add i32 %v15, -64
638  %v517 = sub i32 %v516, %v435
639  %v518 = add i32 %v517, %v436
640  %v519 = getelementptr inbounds i8, i8* %v13, i32 %v518
641  %v520 = bitcast i8* %v519 to <16 x i32>*
642  store <16 x i32> %v515, <16 x i32>* %v520, align 1, !tbaa !12
643  %v521 = add nuw nsw i32 %v444, 1
644  %v522 = icmp eq i32 %v521, %v527
645  br i1 %v522, label %b24, label %b23
646
647b24:                                              ; preds = %b26, %b23
648  %v523 = add nsw i32 %v525, 1
649  %v524 = icmp eq i32 %v523, %v50
650  br i1 %v524, label %b32, label %b25
651
652b25:                                              ; preds = %b24, %b21
653  %v525 = phi i32 [ %v523, %b24 ], [ %v23, %b21 ]
654  br label %b22
655
656b26:                                              ; preds = %b22
657  %v526 = add nsw i32 %v15, 63
658  %v527 = ashr i32 %v526, 6
659  %v528 = icmp slt i32 %v356, %v527
660  br i1 %v528, label %b23, label %b24, !prof !9
661
662b27:                                              ; preds = %b21
663  %v529 = add nsw i32 %v15, 63
664  %v530 = ashr i32 %v529, 6
665  %v531 = icmp slt i32 %v356, %v530
666  br i1 %v531, label %b29, label %b31
667
668b28:                                              ; preds = %b29, %b28
669  %v532 = phi i32 [ %v616, %b28 ], [ %v356, %b29 ]
670  %v533 = sub nsw i32 %v618, %v23
671  %v534 = mul nsw i32 %v533, %v222
672  %v535 = sub nsw i32 %v24, %v27
673  %v536 = add nsw i32 %v534, %v535
674  %v537 = add nsw i32 %v536, -64
675  %v538 = bitcast i8* %v227 to i32*
676  %v539 = getelementptr inbounds i32, i32* %v538, i32 %v537
677  %v540 = bitcast i32* %v539 to <16 x i32>*
678  %v541 = load <16 x i32>, <16 x i32>* %v540, align 4, !tbaa !10
679  %v542 = add nsw i32 %v536, -48
680  %v543 = getelementptr inbounds i32, i32* %v538, i32 %v542
681  %v544 = bitcast i32* %v543 to <16 x i32>*
682  %v545 = load <16 x i32>, <16 x i32>* %v544, align 4, !tbaa !10
683  %v546 = shufflevector <16 x i32> %v541, <16 x i32> %v545, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
684  %v547 = add nsw i32 %v533, 1
685  %v548 = mul nsw i32 %v547, %v222
686  %v549 = add nsw i32 %v548, %v535
687  %v550 = add nsw i32 %v549, -64
688  %v551 = getelementptr inbounds i32, i32* %v538, i32 %v550
689  %v552 = bitcast i32* %v551 to <16 x i32>*
690  %v553 = load <16 x i32>, <16 x i32>* %v552, align 4, !tbaa !10
691  %v554 = add nsw i32 %v549, -48
692  %v555 = getelementptr inbounds i32, i32* %v538, i32 %v554
693  %v556 = bitcast i32* %v555 to <16 x i32>*
694  %v557 = load <16 x i32>, <16 x i32>* %v556, align 4, !tbaa !10
695  %v558 = shufflevector <16 x i32> %v553, <16 x i32> %v557, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
696  %v559 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v558)
697  %v560 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v558)
698  %v561 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v559, i32 168430090)
699  %v562 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v560, i32 168430090)
700  %v563 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v561, <16 x i32> %v562)
701  %v564 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v546, <32 x i32> %v563)
702  %v565 = shufflevector <32 x i32> %v564, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
703  %v566 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v565, i32 20)
704  %v567 = shufflevector <32 x i32> %v564, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
705  %v568 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v567, i32 20)
706  %v569 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v568, <16 x i32> %v566)
707  %v570 = add nsw i32 %v536, -32
708  %v571 = getelementptr inbounds i32, i32* %v538, i32 %v570
709  %v572 = bitcast i32* %v571 to <16 x i32>*
710  %v573 = load <16 x i32>, <16 x i32>* %v572, align 4, !tbaa !10
711  %v574 = add nsw i32 %v536, -16
712  %v575 = getelementptr inbounds i32, i32* %v538, i32 %v574
713  %v576 = bitcast i32* %v575 to <16 x i32>*
714  %v577 = load <16 x i32>, <16 x i32>* %v576, align 4, !tbaa !10
715  %v578 = shufflevector <16 x i32> %v573, <16 x i32> %v577, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
716  %v579 = add nsw i32 %v549, -32
717  %v580 = getelementptr inbounds i32, i32* %v538, i32 %v579
718  %v581 = bitcast i32* %v580 to <16 x i32>*
719  %v582 = load <16 x i32>, <16 x i32>* %v581, align 4, !tbaa !10
720  %v583 = add nsw i32 %v549, -16
721  %v584 = getelementptr inbounds i32, i32* %v538, i32 %v583
722  %v585 = bitcast i32* %v584 to <16 x i32>*
723  %v586 = load <16 x i32>, <16 x i32>* %v585, align 4, !tbaa !10
724  %v587 = shufflevector <16 x i32> %v582, <16 x i32> %v586, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
725  %v588 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v587)
726  %v589 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v587)
727  %v590 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v588, i32 168430090)
728  %v591 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v589, i32 168430090)
729  %v592 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v590, <16 x i32> %v591)
730  %v593 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v578, <32 x i32> %v592)
731  %v594 = shufflevector <32 x i32> %v593, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
732  %v595 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v594, i32 20)
733  %v596 = shufflevector <32 x i32> %v593, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
734  %v597 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v596, i32 20)
735  %v598 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v597, <16 x i32> %v595)
736  %v599 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v569)
737  %v600 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v569)
738  %v601 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v599, <16 x i32> %v600)
739  %v602 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v598)
740  %v603 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v598)
741  %v604 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v602, <16 x i32> %v603)
742  %v605 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v604, <16 x i32> %v601)
743  %v606 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v605)
744  %v607 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v605)
745  %v608 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %v606, <16 x i32> %v607)
746  %v609 = mul nsw i32 %v23, %v19
747  %v610 = mul nsw i32 %v618, %v19
748  %v611 = add i32 %v15, -64
749  %v612 = sub i32 %v611, %v609
750  %v613 = add i32 %v612, %v610
751  %v614 = getelementptr inbounds i8, i8* %v13, i32 %v613
752  %v615 = bitcast i8* %v614 to <16 x i32>*
753  store <16 x i32> %v608, <16 x i32>* %v615, align 1, !tbaa !12
754  %v616 = add nuw nsw i32 %v532, 1
755  %v617 = icmp eq i32 %v616, %v530
756  br i1 %v617, label %b30, label %b28
757
758b29:                                              ; preds = %b30, %b27
759  %v618 = phi i32 [ %v619, %b30 ], [ %v23, %b27 ]
760  br label %b28
761
762b30:                                              ; preds = %b28
763  %v619 = add nsw i32 %v618, 1
764  %v620 = icmp eq i32 %v619, %v50
765  br i1 %v620, label %b32, label %b29
766
767b31:                                              ; preds = %b27, %b20
768  %v621 = icmp eq i8* %v227, null
769  br i1 %v621, label %b33, label %b32
770
771b32:                                              ; preds = %b31, %b30, %b24
772  tail call void @f3(i8* null, i8* %v227) #2
773  br label %b33
774
775b33:                                              ; preds = %b32, %b31
776  ret i32 0
777}
778
; Declarations of the ten Hexagon llvm.hexagon.V6.* intrinsics referenced by
; this module. Every one carries attribute group #1 (nounwind readnone).
; Operands are either a <16 x i32> vector or a <32 x i32> double-width value:
;   - vzb, vzh, vcombine and vaddw.dv return <32 x i32>;
;   - hi and lo take a <32 x i32> and return a <16 x i32>;
;   - vmpyiwb and vlsrw take a <16 x i32> plus an i32 scalar;
;   - vsatwh and vsathub take two <16 x i32> and return one <16 x i32>.
; NOTE(review): the arithmetic meaning of each intrinsic is not visible in this
; file; consult the Hexagon HVX reference before relying on it.
779; Function Attrs: nounwind readnone
780declare <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32>) #1
781
782; Function Attrs: nounwind readnone
783declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1
784
785; Function Attrs: nounwind readnone
786declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) #1
787
788; Function Attrs: nounwind readnone
789declare <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32>) #1
790
791; Function Attrs: nounwind readnone
792declare <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32>, i32) #1
793
794; Function Attrs: nounwind readnone
795declare <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32>, <16 x i32>) #1
796
797; Function Attrs: nounwind readnone
798declare <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32>, <32 x i32>) #1
799
800; Function Attrs: nounwind readnone
801declare <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32>, i32) #1
802
803; Function Attrs: nounwind readnone
804declare <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32>, <16 x i32>) #1
805
806; Function Attrs: nounwind readnone
807declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1
808
; Attribute groups referenced through the `#N` suffixes in this module:
;   #0 - on the @f0..@f5 declarations and the @f6 definition; selects the
;        hexagonv60 CPU with the hvxv60 and hvx-length64b target features.
;   #1 - on every llvm.hexagon.V6.* intrinsic declaration.
;   #2 - on the `tail call void @f3(...)` call site in block b32 of @f6.
809attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length64b" }
810attributes #1 = { nounwind readnone }
811attributes #2 = { nobuiltin nounwind }
812
; Module-level flags: !0-!2 are the three entries of !llvm.module.flags. Each
; has the form { behavior, name, value }; the leading `i32 2` is the merge
; behavior (NOTE(review): 2 is "warning" per the LLVM LangRef - confirm).
813!llvm.module.flags = !{!0, !1, !2}
814
815!0 = !{i32 2, !"halide_use_soft_float_abi", i32 0}
816!1 = !{i32 2, !"halide_mcpu", !"hexagonv60"}
817!2 = !{i32 2, !"halide_mattrs", !"+hvx"}
; !3 and !9 are "branch_weights" nodes with mirrored weights (0 / 2^30 and
; 2^30 / 0). !9 is attached as !prof to the conditional branch in block b26.
818!3 = !{!"branch_weights", i32 0, i32 1073741824}
; !4-!8 and !10-!13 form a TBAA hierarchy rooted at !6 ("Halide buffer"), with
; child type nodes !5 "input", !8 "constant_exterior", !11 "rows" and
; !13 "gaussian11". !4, !7, !10 and !12 are the matching access tags
; { base, access, offset 0 }. In the part of @f6 that follows block b23's
; first shufflevector, every <16 x i32> load uses !tbaa !10 and every
; <16 x i32> store uses !tbaa !12.
819!4 = !{!5, !5, i64 0}
820!5 = !{!"input", !6}
821!6 = !{!"Halide buffer"}
822!7 = !{!8, !8, i64 0}
823!8 = !{!"constant_exterior", !6}
824!9 = !{!"branch_weights", i32 1073741824, i32 0}
825!10 = !{!11, !11, i64 0}
826!11 = !{!"rows", !6}
827!12 = !{!13, !13, i64 0}
828!13 = !{!"gaussian11", !6}
829