; RUN: llc -O3 -march=hexagon < %s | FileCheck %s

; Purpose: compile this module with llc at -O3 for Hexagon and verify, via the
; FileCheck patterns below, that the assembly contains a hardware loop
; (loop0 ... endloop0) whose target block holds at least four `vmemu` vector
; loads that use the post-increment addressing form `rN++#1`.
; The label pattern LBB0_<n> restricts the match to the first function emitted,
; which is @f6 (the first `define` in this file).
; NOTE(review): the string constants and TBAA names in this file ("input",
; "dilate3x3", "Halide buffer") suggest the IR was reduced from a
; Halide-generated 3x3 dilate pipeline -- inferred from names only.

; CHECK: loop0(.[[BLOCK:LBB0_[0-9]+]]
; CHECK: .[[BLOCK]]:
; CHECK: = vmemu({{r[0-9]+}}++#1)
; CHECK: = vmemu({{r[0-9]+}}++#1)
; CHECK: = vmemu({{r[0-9]+}}++#1)
; CHECK: = vmemu({{r[0-9]+}}++#1)
; CHECK: endloop0

target triple = "hexagon-unknown--elf"

; Struct types. %4 is the parameter type of @f6/@f7; @f6 reads its field 1
; (an i8* base pointer) and elements of the three [4 x i32] arrays in
; fields 2, 3 and 4. %0 is the element type of @g10; %3 only appears in the
; declaration of @f3.
%0 = type { i8*, i32, i32, i32, i32, %1*, %1*, %1* }
%1 = type { %2 }
%2 = type { i64 }
%3 = type { i8*, i32, i32, i32, i32, i32, i32, i8*, i32, i32* }
%4 = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8, i8, [6 x i8] }

; Private constant data. None of @g0-@g7 or @g11 is referenced by the
; functions defined in this file; @g8 and @g9 are referenced only from the
; initializer of @g10.
@g0 = private unnamed_addr constant [5 x i8] c"Load\00", align 1
@g1 = private unnamed_addr constant [6 x i8] c"Store\00", align 1
@g2 = private unnamed_addr constant [18 x i8] c"Begin realization\00", align 1
@g3 = private unnamed_addr constant [16 x i8] c"End realization\00", align 1
@g4 = private unnamed_addr constant [8 x i8] c"Produce\00", align 1
@g5 = private unnamed_addr constant [7 x i8] c"Update\00", align 1
@g6 = private unnamed_addr constant [8 x i8] c"Consume\00", align 1
@g7 = private unnamed_addr constant [12 x i8] c"End consume\00", align 1
@g8 = private constant [6 x i8] c"input\00", align 32
@g9 = private constant [10 x i8] c"dilate3x3\00", align 32
@g10 = private constant [2 x %0] [%0 { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @g8, i32 0, i32 0), i32 1, i32 2, i32 1, i32 8, %1* null, %1* null, %1* null }, %0 { i8* getelementptr inbounds ([10 x i8], [10 x i8]* @g9, i32 0, i32 0), i32 2, i32 2, i32 1, i32 8, %1* null, %1* null, %1* null }]
@g11 = private constant [64 x i8] c"...............................................................\00", align 32

; External declarations @f0-@f5. None of them is called by the functions
; defined in this file.

; Function Attrs: nounwind
declare i8* @f0(i8*, i32) #0

; Function Attrs: nounwind
declare void @f1(i8*, i8*) #0

; Function Attrs: nounwind
declare void @f2(i8*, i8*) #0

; Function Attrs: nounwind
declare i32 @f3(i8*, %3*) #0

; Function Attrs: nounwind
declare void @f4() #0

; Function Attrs: nounwind
declare void @f5() #0

; @f6 is the function under test. It always returns 0.
;
; Inputs:  %a0 -- source descriptor (%4*); loads are made through its field 1.
;          %a1 -- destination descriptor (%4*); stores go through its field 1.
;
; Control-flow overview:
;   b0        read descriptor fields; leave immediately if %v19 <= 0.
;   b1        %v24 = max(%v11 >> 7, 0); pick the b5 nest if %v24 > 0,
;             otherwise go to b7.
;   b5 -> b2 -> b6 -> (b3) -> b4
;             outer loop on %v150 (starts at %v17, ends when %v150 + 1 == %v20)
;             containing inner loop b2 (counter %v26 from 0 to %v24) and,
;             when %v24 < (%v11 + 127) >> 7, a second inner loop b3.
;   b7 -> b9 -> b8 -> b10
;             the variant taken when %v24 == 0: outer loop on %v223 with a
;             single inner loop b8 that mirrors b3.
;   b11       common exit.
;
; Every inner-loop body loads nine <32 x i32> vectors with `align 1`, reduces
; them with eight calls to @llvm.hexagon.V6.vmaxub.128B, and stores one
; <32 x i32> result with `align 1`.

; Function Attrs: nounwind
define i32 @f6(%4* noalias nocapture readonly %a0, %4* noalias nocapture readonly %a1) #0 {
b0:
  ; Fields read from the source descriptor %a0.
  %v0 = getelementptr inbounds %4, %4* %a0, i32 0, i32 1
  %v1 = load i8*, i8** %v0, align 4
  %v2 = getelementptr inbounds %4, %4* %a0, i32 0, i32 3, i32 1
  %v3 = load i32, i32* %v2, align 4
  %v4 = getelementptr inbounds %4, %4* %a0, i32 0, i32 4, i32 0
  %v5 = load i32, i32* %v4, align 4
  %v6 = getelementptr inbounds %4, %4* %a0, i32 0, i32 4, i32 1
  %v7 = load i32, i32* %v6, align 4
  ; Fields read from the destination descriptor %a1.
  %v8 = getelementptr inbounds %4, %4* %a1, i32 0, i32 1
  %v9 = load i8*, i8** %v8, align 4
  %v10 = getelementptr inbounds %4, %4* %a1, i32 0, i32 2, i32 0
  %v11 = load i32, i32* %v10, align 4
  %v12 = getelementptr inbounds %4, %4* %a1, i32 0, i32 3, i32 1
  %v13 = load i32, i32* %v12, align 4
  %v14 = getelementptr inbounds %4, %4* %a1, i32 0, i32 4, i32 0
  %v15 = load i32, i32* %v14, align 4
  %v16 = getelementptr inbounds %4, %4* %a1, i32 0, i32 4, i32 1
  %v17 = load i32, i32* %v16, align 4
  %v18 = getelementptr inbounds %4, %4* %a1, i32 0, i32 2, i32 1
  %v19 = load i32, i32* %v18, align 4
  ; %v20 = %v19 + %v17 is the end value compared against in b4 and b10.
  %v20 = add nsw i32 %v19, %v17
  ; Nothing to do when %v19 is not positive.
  %v21 = icmp sgt i32 %v19, 0
  br i1 %v21, label %b1, label %b11, !prof !3

b1:                                               ; preds = %b0
  ; %v24 = max(%v11 >> 7, 0): trip count of inner loop b2.
  %v22 = ashr i32 %v11, 7
  %v23 = icmp slt i32 %v22, 0
  %v24 = select i1 %v23, i32 0, i32 %v22
  %v25 = icmp sgt i32 %v24, 0
  br i1 %v25, label %b5, label %b7, !prof !3

b2:                                               ; preds = %b5, %b2
  ; Inner loop, counter %v26 = 0 .. %v24 - 1.
  %v26 = phi i32 [ %v90, %b2 ], [ 0, %b5 ]
  %v27 = mul nsw i32 %v7, %v3
  %v28 = add nsw i32 %v27, %v5
  ; %v30 = %v26 * 128 + %v15: the only address term that changes per iteration.
  %v29 = shl nsw i32 %v26, 7
  %v30 = add nsw i32 %v29, %v15
  ; %v32, %v33, %v35 = (%v150 - 1), %v150, (%v150 + 1), each multiplied by %v3.
  ; These values (and %v28, %v34, %v83, %v84) are reused by b3 and b4.
  %v31 = add nsw i32 %v150, -1
  %v32 = mul nsw i32 %v31, %v3
  %v33 = mul nsw i32 %v150, %v3
  %v34 = add nsw i32 %v150, 1
  %v35 = mul nsw i32 %v34, %v3
  ; First group: loads at %v37 - 1, %v37, %v37 + 1 (based on %v32).
  %v36 = sub i32 %v32, %v28
  %v37 = add i32 %v36, %v30
  %v38 = add nsw i32 %v37, -1
  %v39 = getelementptr inbounds i8, i8* %v1, i32 %v38
  %v40 = bitcast i8* %v39 to <32 x i32>*
  %v41 = load <32 x i32>, <32 x i32>* %v40, align 1, !tbaa !4
  %v42 = getelementptr inbounds i8, i8* %v1, i32 %v37
  %v43 = bitcast i8* %v42 to <32 x i32>*
  %v44 = load <32 x i32>, <32 x i32>* %v43, align 1, !tbaa !4
  %v45 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v41, <32 x i32> %v44)
  %v46 = add nsw i32 %v37, 1
  %v47 = getelementptr inbounds i8, i8* %v1, i32 %v46
  %v48 = bitcast i8* %v47 to <32 x i32>*
  %v49 = load <32 x i32>, <32 x i32>* %v48, align 1, !tbaa !4
  %v50 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v45, <32 x i32> %v49)
  ; Second group: loads at %v52 - 1, %v52, %v52 + 1 (based on %v33).
  %v51 = sub i32 %v33, %v28
  %v52 = add i32 %v51, %v30
  %v53 = add nsw i32 %v52, -1
  %v54 = getelementptr inbounds i8, i8* %v1, i32 %v53
  %v55 = bitcast i8* %v54 to <32 x i32>*
  %v56 = load <32 x i32>, <32 x i32>* %v55, align 1, !tbaa !4
  %v57 = getelementptr inbounds i8, i8* %v1, i32 %v52
  %v58 = bitcast i8* %v57 to <32 x i32>*
  %v59 = load <32 x i32>, <32 x i32>* %v58, align 1, !tbaa !4
  %v60 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v56, <32 x i32> %v59)
  %v61 = add nsw i32 %v52, 1
  %v62 = getelementptr inbounds i8, i8* %v1, i32 %v61
  %v63 = bitcast i8* %v62 to <32 x i32>*
  %v64 = load <32 x i32>, <32 x i32>* %v63, align 1, !tbaa !4
  %v65 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v60, <32 x i32> %v64)
  %v66 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v50, <32 x i32> %v65)
  ; Third group: loads at %v68 - 1, %v68, %v68 + 1 (based on %v35).
  %v67 = sub i32 %v35, %v28
  %v68 = add i32 %v67, %v30
  %v69 = add nsw i32 %v68, -1
  %v70 = getelementptr inbounds i8, i8* %v1, i32 %v69
  %v71 = bitcast i8* %v70 to <32 x i32>*
  %v72 = load <32 x i32>, <32 x i32>* %v71, align 1, !tbaa !4
  %v73 = getelementptr inbounds i8, i8* %v1, i32 %v68
  %v74 = bitcast i8* %v73 to <32 x i32>*
  %v75 = load <32 x i32>, <32 x i32>* %v74, align 1, !tbaa !4
  %v76 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v72, <32 x i32> %v75)
  %v77 = add nsw i32 %v68, 1
  %v78 = getelementptr inbounds i8, i8* %v1, i32 %v77
  %v79 = bitcast i8* %v78 to <32 x i32>*
  %v80 = load <32 x i32>, <32 x i32>* %v79, align 1, !tbaa !4
  %v81 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v76, <32 x i32> %v80)
  ; %v82 combines all nine loaded vectors.
  %v82 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v66, <32 x i32> %v81)
  ; Destination offset: %v150 * %v13 - (%v17 * %v13 + %v15) + %v30.
  %v83 = mul nsw i32 %v150, %v13
  %v84 = mul nsw i32 %v17, %v13
  %v85 = add i32 %v84, %v15
  %v86 = sub i32 %v83, %v85
  %v87 = add i32 %v86, %v30
  %v88 = getelementptr inbounds i8, i8* %v9, i32 %v87
  %v89 = bitcast i8* %v88 to <32 x i32>*
  store <32 x i32> %v82, <32 x i32>* %v89, align 1, !tbaa !7
  %v90 = add nuw nsw i32 %v26, 1
  %v91 = icmp eq i32 %v90, %v24
  br i1 %v91, label %b6, label %b2

b3:                                               ; preds = %b6, %b3
  ; Second inner loop, counter %v92 = %v24 .. %v152 - 1. The counter is used
  ; only for the exit test: every address below is built from values that do
  ; not change inside this loop (%v11, %v15, %v28, %v32, %v33, %v35, %v83,
  ; %v84), at offsets -129, -128 and -127 from each base.
  %v92 = phi i32 [ %v147, %b3 ], [ %v24, %b6 ]
  %v93 = add nsw i32 %v15, %v11
  %v94 = sub i32 %v93, %v28
  ; First group (based on %v32).
  %v95 = add i32 %v94, %v32
  %v96 = add nsw i32 %v95, -129
  %v97 = getelementptr inbounds i8, i8* %v1, i32 %v96
  %v98 = bitcast i8* %v97 to <32 x i32>*
  %v99 = load <32 x i32>, <32 x i32>* %v98, align 1, !tbaa !4
  %v100 = add nsw i32 %v95, -128
  %v101 = getelementptr inbounds i8, i8* %v1, i32 %v100
  %v102 = bitcast i8* %v101 to <32 x i32>*
  %v103 = load <32 x i32>, <32 x i32>* %v102, align 1, !tbaa !4
  %v104 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v99, <32 x i32> %v103)
  %v105 = add nsw i32 %v95, -127
  %v106 = getelementptr inbounds i8, i8* %v1, i32 %v105
  %v107 = bitcast i8* %v106 to <32 x i32>*
  %v108 = load <32 x i32>, <32 x i32>* %v107, align 1, !tbaa !4
  %v109 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v104, <32 x i32> %v108)
  ; Second group (based on %v33).
  %v110 = add i32 %v94, %v33
  %v111 = add nsw i32 %v110, -129
  %v112 = getelementptr inbounds i8, i8* %v1, i32 %v111
  %v113 = bitcast i8* %v112 to <32 x i32>*
  %v114 = load <32 x i32>, <32 x i32>* %v113, align 1, !tbaa !4
  %v115 = add nsw i32 %v110, -128
  %v116 = getelementptr inbounds i8, i8* %v1, i32 %v115
  %v117 = bitcast i8* %v116 to <32 x i32>*
  %v118 = load <32 x i32>, <32 x i32>* %v117, align 1, !tbaa !4
  %v119 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v114, <32 x i32> %v118)
  %v120 = add nsw i32 %v110, -127
  %v121 = getelementptr inbounds i8, i8* %v1, i32 %v120
  %v122 = bitcast i8* %v121 to <32 x i32>*
  %v123 = load <32 x i32>, <32 x i32>* %v122, align 1, !tbaa !4
  %v124 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v119, <32 x i32> %v123)
  %v125 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v109, <32 x i32> %v124)
  ; Third group (based on %v35).
  %v126 = add i32 %v94, %v35
  %v127 = add nsw i32 %v126, -129
  %v128 = getelementptr inbounds i8, i8* %v1, i32 %v127
  %v129 = bitcast i8* %v128 to <32 x i32>*
  %v130 = load <32 x i32>, <32 x i32>* %v129, align 1, !tbaa !4
  %v131 = add nsw i32 %v126, -128
  %v132 = getelementptr inbounds i8, i8* %v1, i32 %v131
  %v133 = bitcast i8* %v132 to <32 x i32>*
  %v134 = load <32 x i32>, <32 x i32>* %v133, align 1, !tbaa !4
  %v135 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v130, <32 x i32> %v134)
  %v136 = add nsw i32 %v126, -127
  %v137 = getelementptr inbounds i8, i8* %v1, i32 %v136
  %v138 = bitcast i8* %v137 to <32 x i32>*
  %v139 = load <32 x i32>, <32 x i32>* %v138, align 1, !tbaa !4
  %v140 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v135, <32 x i32> %v139)
  %v141 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v125, <32 x i32> %v140)
  ; Destination offset: (%v11 - 128) - %v84 + %v83.
  %v142 = add i32 %v11, -128
  %v143 = sub i32 %v142, %v84
  %v144 = add i32 %v143, %v83
  %v145 = getelementptr inbounds i8, i8* %v9, i32 %v144
  %v146 = bitcast i8* %v145 to <32 x i32>*
  store <32 x i32> %v141, <32 x i32>* %v146, align 1, !tbaa !7
  %v147 = add nuw nsw i32 %v92, 1
  %v148 = icmp eq i32 %v147, %v152
  br i1 %v148, label %b4, label %b3

b4:                                               ; preds = %b6, %b3
  ; Outer-loop latch: finish when %v150 + 1 (%v34) reaches %v20.
  %v149 = icmp eq i32 %v34, %v20
  br i1 %v149, label %b11, label %b5

b5:                                               ; preds = %b4, %b1
  ; Outer-loop header: %v150 starts at %v17 and advances by one per trip.
  %v150 = phi i32 [ %v34, %b4 ], [ %v17, %b1 ]
  br label %b2

b6:                                               ; preds = %b2
  ; %v152 = (%v11 + 127) >> 7; run b3 only if it exceeds %v24.
  %v151 = add nsw i32 %v11, 127
  %v152 = ashr i32 %v151, 7
  %v153 = icmp slt i32 %v24, %v152
  br i1 %v153, label %b3, label %b4, !prof !3

b7:                                               ; preds = %b1
  ; Reached when %v24 == 0. %v155 = (%v11 + 127) >> 7; if it is not greater
  ; than %v24 there is no work at all.
  %v154 = add nsw i32 %v11, 127
  %v155 = ashr i32 %v154, 7
  %v156 = icmp slt i32 %v24, %v155
  br i1 %v156, label %b9, label %b11, !prof !3

b8:                                               ; preds = %b9, %b8
  ; Inner loop, counter %v157 = %v24 .. %v155 - 1. As in b3, the counter is
  ; used only for the exit test; the addresses depend on the outer value
  ; %v223 but not on %v157.
  %v157 = phi i32 [ %v221, %b8 ], [ %v24, %b9 ]
  %v158 = mul nsw i32 %v7, %v3
  %v159 = add nsw i32 %v158, %v5
  %v160 = add nsw i32 %v15, %v11
  ; %v162, %v163, %v165 = (%v223 - 1), %v223, (%v223 + 1), each times %v3.
  %v161 = add nsw i32 %v223, -1
  %v162 = mul nsw i32 %v161, %v3
  %v163 = mul nsw i32 %v223, %v3
  %v164 = add nsw i32 %v223, 1
  %v165 = mul nsw i32 %v164, %v3
  %v166 = sub i32 %v160, %v159
  ; First group (based on %v162), offsets -129, -128, -127.
  %v167 = add i32 %v166, %v162
  %v168 = add nsw i32 %v167, -129
  %v169 = getelementptr inbounds i8, i8* %v1, i32 %v168
  %v170 = bitcast i8* %v169 to <32 x i32>*
  %v171 = load <32 x i32>, <32 x i32>* %v170, align 1, !tbaa !4
  %v172 = add nsw i32 %v167, -128
  %v173 = getelementptr inbounds i8, i8* %v1, i32 %v172
  %v174 = bitcast i8* %v173 to <32 x i32>*
  %v175 = load <32 x i32>, <32 x i32>* %v174, align 1, !tbaa !4
  %v176 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v171, <32 x i32> %v175)
  %v177 = add nsw i32 %v167, -127
  %v178 = getelementptr inbounds i8, i8* %v1, i32 %v177
  %v179 = bitcast i8* %v178 to <32 x i32>*
  %v180 = load <32 x i32>, <32 x i32>* %v179, align 1, !tbaa !4
  %v181 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v176, <32 x i32> %v180)
  ; Second group (based on %v163).
  %v182 = add i32 %v166, %v163
  %v183 = add nsw i32 %v182, -129
  %v184 = getelementptr inbounds i8, i8* %v1, i32 %v183
  %v185 = bitcast i8* %v184 to <32 x i32>*
  %v186 = load <32 x i32>, <32 x i32>* %v185, align 1, !tbaa !4
  %v187 = add nsw i32 %v182, -128
  %v188 = getelementptr inbounds i8, i8* %v1, i32 %v187
  %v189 = bitcast i8* %v188 to <32 x i32>*
  %v190 = load <32 x i32>, <32 x i32>* %v189, align 1, !tbaa !4
  %v191 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v186, <32 x i32> %v190)
  %v192 = add nsw i32 %v182, -127
  %v193 = getelementptr inbounds i8, i8* %v1, i32 %v192
  %v194 = bitcast i8* %v193 to <32 x i32>*
  %v195 = load <32 x i32>, <32 x i32>* %v194, align 1, !tbaa !4
  %v196 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v191, <32 x i32> %v195)
  %v197 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v181, <32 x i32> %v196)
  ; Third group (based on %v165).
  %v198 = add i32 %v166, %v165
  %v199 = add nsw i32 %v198, -129
  %v200 = getelementptr inbounds i8, i8* %v1, i32 %v199
  %v201 = bitcast i8* %v200 to <32 x i32>*
  %v202 = load <32 x i32>, <32 x i32>* %v201, align 1, !tbaa !4
  %v203 = add nsw i32 %v198, -128
  %v204 = getelementptr inbounds i8, i8* %v1, i32 %v203
  %v205 = bitcast i8* %v204 to <32 x i32>*
  %v206 = load <32 x i32>, <32 x i32>* %v205, align 1, !tbaa !4
  %v207 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v202, <32 x i32> %v206)
  %v208 = add nsw i32 %v198, -127
  %v209 = getelementptr inbounds i8, i8* %v1, i32 %v208
  %v210 = bitcast i8* %v209 to <32 x i32>*
  %v211 = load <32 x i32>, <32 x i32>* %v210, align 1, !tbaa !4
  %v212 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v207, <32 x i32> %v211)
  %v213 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v197, <32 x i32> %v212)
  ; Destination offset: (%v11 - 128) - %v17 * %v13 + %v223 * %v13.
  %v214 = mul nsw i32 %v223, %v13
  %v215 = mul nsw i32 %v17, %v13
  %v216 = add i32 %v11, -128
  %v217 = sub i32 %v216, %v215
  %v218 = add i32 %v217, %v214
  %v219 = getelementptr inbounds i8, i8* %v9, i32 %v218
  %v220 = bitcast i8* %v219 to <32 x i32>*
  store <32 x i32> %v213, <32 x i32>* %v220, align 1, !tbaa !7
  %v221 = add nuw nsw i32 %v157, 1
  %v222 = icmp eq i32 %v221, %v155
  br i1 %v222, label %b10, label %b8

b9:                                               ; preds = %b10, %b7
  ; Outer-loop header for the b8 nest: %v223 starts at %v17.
  %v223 = phi i32 [ %v164, %b10 ], [ %v17, %b7 ]
  br label %b8

b10:                                              ; preds = %b8
  ; Outer-loop latch: finish when %v223 + 1 (%v164) reaches %v20.
  %v224 = icmp eq i32 %v164, %v20
  br i1 %v224, label %b11, label %b9

b11:                                              ; preds = %b10, %b7, %b4, %b0
  ret i32 0
}

; HVX intrinsic used for every reduction step in @f6: takes two <32 x i32>
; operands and returns a <32 x i32>.
; Function Attrs: nounwind readnone
declare <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32>, <32 x i32>) #1

; @f7 forwards both arguments to @f6, ignores the result and returns 0.
; Function Attrs: nounwind
define i32 @f7(%4* noalias nocapture readonly %a0, %4* noalias nocapture readonly %a1) #0 {
b0:
  %v0 = tail call i32 @f6(%4* %a0, %4* %a1) #0
  ret i32 0
}

; @f8 treats %a0 as an array of pointers, loads elements 0 and 1 as %4*
; values, passes them to @f7, ignores the result and returns 0.
; Function Attrs: nounwind
define i32 @f8(i8** nocapture readonly %a0) #0 {
b0:
  %v0 = bitcast i8** %a0 to %4**
  %v1 = load %4*, %4** %v0, align 4
  %v2 = getelementptr i8*, i8** %a0, i32 1
  %v3 = bitcast i8** %v2 to %4**
  %v4 = load %4*, %4** %v3, align 4
  %v5 = tail call i32 @f7(%4* %v1, %4* %v4)
  ret i32 0
}

; #0 selects hexagonv60 with HVX in 128-byte mode for every function that
; carries it; this matches the .128B intrinsic variant used above.
attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length128b" }
attributes #1 = { nounwind readnone }

!llvm.module.flags = !{!0, !1, !2}

; NOTE(review): !2 records "+hvx-length64b" while attributes #0 above uses
; "+hvx-length128b". Nothing in this file reads these halide_* flags, so the
; mismatch is presumably harmless residue from test reduction -- confirm
; before relying on it.
!0 = !{i32 2, !"halide_use_soft_float_abi", i32 0}
!1 = !{i32 2, !"halide_mcpu", !"hexagonv60"}
!2 = !{i32 2, !"halide_mattrs", !"+hvxv60,+hvx-length64b"}
; !3: branch weights attached to the guard branches in @f6.
!3 = !{!"branch_weights", i32 1073741824, i32 0}
; !4/!5: TBAA tag used on all vector loads; !7/!8: TBAA tag used on all
; vector stores. Both share the root !6.
!4 = !{!5, !5, i64 0}
!5 = !{!"input", !6}
!6 = !{!"Halide buffer"}
!7 = !{!8, !8, i64 0}
!8 = !{!"dilate3x3", !6}