1; RUN: opt %s -analyze -divergence | FileCheck %s 2 3target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" 4target triple = "nvptx64-nvidia-cuda" 5 6; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) 7define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { 8; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge' 9entry: 10 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 11 %cond = icmp slt i32 %n, 0 12 br i1 %cond, label %then, label %else ; uniform 13; CHECK-NOT: DIVERGENT: br i1 %cond, 14then: 15 %a1 = add i32 %a, %tid 16 br label %merge 17else: 18 %b2 = add i32 %b, %tid 19 br label %merge 20merge: 21 %c = phi i32 [ %a1, %then ], [ %b2, %else ] 22 ret i32 %c 23} 24 25; c = a; 26; if (threadIdx.x < 5) // divergent: data dependent 27; c = b; 28; return c; // c is divergent: sync dependent 29define i32 @sync(i32 %a, i32 %b) { 30; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync' 31bb1: 32 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() 33 %cond = icmp slt i32 %tid, 5 34 br i1 %cond, label %bb2, label %bb3 35; CHECK: DIVERGENT: br i1 %cond, 36bb2: 37 br label %bb3 38bb3: 39 %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid 40; CHECK: DIVERGENT: %c = 41 ret i32 %c 42} 43 44; c = 0; 45; if (threadIdx.x >= 5) { // divergent 46; c = (n < 0 ? a : b); // c here is uniform because n is uniform 47; } 48; // c here is divergent because it is sync dependent on threadIdx.x >= 5 49; return c; 50define i32 @mixed(i32 %n, i32 %a, i32 %b) { 51; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed' 52bb1: 53 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() 54 %cond = icmp slt i32 %tid, 5 55 br i1 %cond, label %bb6, label %bb2 56; CHECK: DIVERGENT: br i1 %cond, 57bb2: 58 %cond2 = icmp slt i32 %n, 0 59 br i1 %cond2, label %bb4, label %bb3 60bb3: 61 br label %bb5 62bb4: 63 br label %bb5 64bb5: 65 %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] 66; CHECK-NOT: DIVERGENT: %c = 67 br label %bb6 68bb6: 69 %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] 70; CHECK: DIVERGENT: %c2 = 71 ret i32 %c2 72} 73 74; We conservatively treats all parameters of a __device__ function as divergent. 75define i32 @device(i32 %n, i32 %a, i32 %b) { 76; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device' 77; CHECK: DIVERGENT: i32 %n 78; CHECK: DIVERGENT: i32 %a 79; CHECK: DIVERGENT: i32 %b 80entry: 81 %cond = icmp slt i32 %n, 0 82 br i1 %cond, label %then, label %else 83; CHECK: DIVERGENT: br i1 %cond, 84then: 85 br label %merge 86else: 87 br label %merge 88merge: 89 %c = phi i32 [ %a, %then ], [ %b, %else ] 90 ret i32 %c 91} 92 93; int i = 0; 94; do { 95; i++; // i here is uniform 96; } while (i < laneid); 97; return i == 10 ? 0 : 1; // i here is divergent 98; 99; The i defined in the loop is used outside. 100define i32 @loop() { 101; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop' 102entry: 103 %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() 104 br label %loop 105loop: 106 %i = phi i32 [ 0, %entry ], [ %i1, %loop ] 107; CHECK-NOT: DIVERGENT: %i = 108 %i1 = add i32 %i, 1 109 %exit_cond = icmp sge i32 %i1, %laneid 110 br i1 %exit_cond, label %loop_exit, label %loop 111loop_exit: 112 %cond = icmp eq i32 %i, 10 113 br i1 %cond, label %then, label %else 114; CHECK: DIVERGENT: br i1 %cond, 115then: 116 ret i32 0 117else: 118 ret i32 1 119} 120 121; Same as @loop, but the loop is in the LCSSA form. 122define i32 @lcssa() { 123; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa' 124entry: 125 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 126 br label %loop 127loop: 128 %i = phi i32 [ 0, %entry ], [ %i1, %loop ] 129; CHECK-NOT: DIVERGENT: %i = 130 %i1 = add i32 %i, 1 131 %exit_cond = icmp sge i32 %i1, %tid 132 br i1 %exit_cond, label %loop_exit, label %loop 133loop_exit: 134 %i.lcssa = phi i32 [ %i, %loop ] 135; CHECK: DIVERGENT: %i.lcssa = 136 %cond = icmp eq i32 %i.lcssa, 10 137 br i1 %cond, label %then, label %else 138; CHECK: DIVERGENT: br i1 %cond, 139then: 140 ret i32 0 141else: 142 ret i32 1 143} 144 145; This test contains an unstructured loop. 146; +-------------- entry ----------------+ 147; | | 148; V V 149; i1 = phi(0, i3) i2 = phi(0, i3) 150; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 151; ^ | ^ 152; | V | 153; +-------- switch (tid / i3) ----------+ 154; | 155; V 156; if (i3 == 5) // divergent 157; because sync dependent on (tid / i3). 158define i32 @unstructured_loop(i1 %entry_cond) { 159; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop' 160entry: 161 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 162 br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 163loop_entry_1: 164 %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] 165 %j1 = add i32 %i1, 1 166 br label %loop_body 167loop_entry_2: 168 %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] 169 %j2 = add i32 %i2, 2 170 br label %loop_body 171loop_body: 172 %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] 173 br label %loop_latch 174loop_latch: 175 %div = sdiv i32 %tid, %i3 176 switch i32 %div, label %branch [ i32 1, label %loop_entry_1 177 i32 2, label %loop_entry_2 ] 178branch: 179 %cmp = icmp eq i32 %i3, 5 180 br i1 %cmp, label %then, label %else 181; CHECK: DIVERGENT: br i1 %cmp, 182then: 183 ret i32 0 184else: 185 ret i32 1 186} 187 188; Verifies sync-dependence is computed correctly in the absense of loops. 189define i32 @sync_no_loop(i32 %arg) { 190entry: 191 %0 = add i32 %arg, 1 192 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 193 %1 = icmp sge i32 %tid, 10 194 br i1 %1, label %bb1, label %bb2 195 196bb1: 197 br label %bb3 198 199bb2: 200 br label %bb3 201 202bb3: 203 %2 = add i32 %0, 2 204 ; CHECK-NOT: DIVERGENT: %2 205 ret i32 %2 206} 207 208declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() 209declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() 210declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() 211declare i32 @llvm.nvvm.read.ptx.sreg.laneid() 212 213!nvvm.annotations = !{!0, !1, !2, !3, !4, !5} 214!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} 215!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} 216!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} 217!3 = !{i32 ()* @loop, !"kernel", i32 1} 218!4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} 219!5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} 220