; RUN: opt -basic-aa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
; RUN:   < %s | FileCheck %s

; RUN: opt -basic-aa -loop-distribute -enable-loop-distribute -loop-vectorize -force-vector-width=4 \
; RUN:   -verify-loop-info -verify-dom-info -S < %s | \
; RUN:   FileCheck --check-prefix=VECTORIZE %s

; RUN: opt -basic-aa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \
; RUN:   -loop-accesses -analyze < %s -enable-new-pm=0 | FileCheck %s --check-prefix=ANALYSIS

; TODO: the following changes the order in which loop-access printing prints
; TODO: loops; remove the legacy RUN line and switch over after the NPM switch.
; TODO: opt -aa-pipeline=basic-aa -passes='loop-distribute,print-access-info' -enable-loop-distribute \
; TODO:   -verify-loop-info -verify-dom-info -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS

; The memcheck version of basic.ll.  We should distribute and vectorize the
; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B}):
;
;   for (i = 0; i < n; i++) {
;     A[i + 1] = A[i] * B[i];
; -------------------------------
;     C[i] = D[i] * E[i];
;   }

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"

@B = common global i32* null, align 8
@A = common global i32* null, align 8
@C = common global i32* null, align 8
@D = common global i32* null, align 8
@E = common global i32* null, align 8

; CHECK-LABEL: @f(
define void @f() {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

; We have two compares for each array overlap check, so the 5 checks would
; normally need 10 compares.  Since the checks to A and A + 4 get merged
; into a single range, the A+1-vs-C and C-vs-A checks collapse into one,
; leaving 4 checks for a total of 8 compares.
;
; CHECK: for.body.lver.check:
; CHECK: = icmp
; CHECK: = icmp

; CHECK: = icmp
; CHECK: = icmp

; CHECK: = icmp
; CHECK: = icmp

; CHECK: = icmp
; CHECK: = icmp

; CHECK-NOT: = icmp
; CHECK: br i1 %memcheck.conflict, label %for.body.ph.lver.orig, label %for.body.ph.ldist1

; The non-distributed loop that the memchecks fall back on.

; CHECK: for.body.ph.lver.orig:
; CHECK:   br label %for.body.lver.orig
; CHECK: for.body.lver.orig:
; CHECK:   br i1 %exitcond.lver.orig, label %for.end.loopexit, label %for.body.lver.orig

; Verify the two distributed loops.
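;
; As a rough sketch (informational only, not part of the checked output),
; the distributed form should be equivalent to:
;
;   for (i = 0; i < n; i++)      // for.body.ldist1: keeps the loop-carried
;     A[i + 1] = A[i] * B[i];    //   dependence through A, so it stays scalar
;   for (i = 0; i < n; i++)      // for.body: independent accesses; this is
;     C[i] = D[i] * E[i];        //   the part the vectorizer can handle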

; CHECK: for.body.ph.ldist1:
; CHECK:   br label %for.body.ldist1
; CHECK: for.body.ldist1:
; CHECK:   %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
; CHECK:   br i1 %exitcond.ldist1, label %for.body.ph, label %for.body.ldist1

; CHECK: for.body.ph:
; CHECK:   br label %for.body
; CHECK: for.body:
; CHECK:   %mulC = mul i32 %loadD, %loadE
; CHECK: for.end:


; VECTORIZE: mul <4 x i32>

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %mulC = mul i32 %loadD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Make sure there's no "Multiple reports generated" assert with a
; volatile load, and that no distribution takes place.

; TODO: Distribution of volatile accesses may be possible under some
; circumstances, but the current implementation does not touch them.

; CHECK-LABEL: @f_volatile_load(
; CHECK: br label %for.body{{$}}

; CHECK-NOT: load

; CHECK: {{^}}for.body:
; CHECK: load i32
; CHECK: load i32
; CHECK: load volatile i32
; CHECK: load i32
; CHECK: br i1 %exitcond, label %for.end, label %for.body{{$}}

; CHECK-NOT: load

; VECTORIZE-NOT: load <4 x i32>
; VECTORIZE-NOT: mul <4 x i32>
define void @f_volatile_load() {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load volatile i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %mulC = mul i32 %loadD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

declare i32 @llvm.convergent(i32) #0

; This is the same as f, and would require the same bounds
; check.  However, it is not OK to introduce new control dependencies
; on the convergent call.

; CHECK-LABEL: @f_with_convergent(
; CHECK: call i32 @llvm.convergent
; CHECK-NOT: call i32 @llvm.convergent

; ANALYSIS: for.body:
; ANALYSIS: Report: cannot add control dependency to convergent operation
define void @f_with_convergent() #1 {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %convergentD = call i32 @llvm.convergent(i32 %loadD)
  %mulC = mul i32 %convergentD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Make sure an explicit request for distribution is ignored if it
; requires possibly illegal checks.
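;
; (For reference: the request below comes from the !llvm.loop metadata on the
; backedge branch; see !0 and !1 at the end of the file. Since the runtime
; checks needed for versioning would add a control dependency to the
; convergent call, the hint has to be dropped.)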

; CHECK-LABEL: @f_with_convergent_forced_distribute(
; CHECK: call i32 @llvm.convergent
; CHECK-NOT: call i32 @llvm.convergent
define void @f_with_convergent_forced_distribute() #1 {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %convergentD = call i32 @llvm.convergent(i32 %loadD)
  %mulC = mul i32 %convergentD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body
  ret void
}

attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind convergent }

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
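
; A note on the metadata above (informational, not checked): !0 is the
; distinct loop ID attached to the backedge branch of
; @f_with_convergent_forced_distribute, and !1 is the distribution hint,
; roughly what clang emits for "#pragma clang loop distribute(enable)".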