1; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s 2 3target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" 4target triple = "nvptx64-nvidia-cuda" 5 6; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) 7define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { 8; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge' 9entry: 10 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 11 %cond = icmp slt i32 %n, 0 12 br i1 %cond, label %then, label %else ; uniform 13; CHECK-NOT: DIVERGENT: br i1 %cond, 14then: 15 %a1 = add i32 %a, %tid 16 br label %merge 17else: 18 %b2 = add i32 %b, %tid 19 br label %merge 20merge: 21 %c = phi i32 [ %a1, %then ], [ %b2, %else ] 22 ret i32 %c 23} 24 25; c = a; 26; if (threadIdx.x < 5) // divergent: data dependent 27; c = b; 28; return c; // c is divergent: sync dependent 29define i32 @sync(i32 %a, i32 %b) { 30; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync' 31bb1: 32 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() 33 %cond = icmp slt i32 %tid, 5 34 br i1 %cond, label %bb2, label %bb3 35; CHECK: DIVERGENT: br i1 %cond, 36bb2: 37 br label %bb3 38bb3: 39 %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid 40; CHECK: DIVERGENT: %c = 41 ret i32 %c 42} 43 44; c = 0; 45; if (threadIdx.x >= 5) { // divergent 46; c = (n < 0 ? a : b); // c here is uniform because n is uniform 47; } 48; // c here is divergent because it is sync dependent on threadIdx.x >= 5 49; return c; 50define i32 @mixed(i32 %n, i32 %a, i32 %b) { 51; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed' 52bb1: 53 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() 54 %cond = icmp slt i32 %tid, 5 55 br i1 %cond, label %bb6, label %bb2 56; CHECK: DIVERGENT: br i1 %cond, 57bb2: 58 %cond2 = icmp slt i32 %n, 0 59 br i1 %cond2, label %bb4, label %bb3 60bb3: 61 br label %bb5 62bb4: 63 br label %bb5 64bb5: 65 %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] 66; CHECK-NOT: DIVERGENT: %c = 67 br label %bb6 68bb6: 69 %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] 70; CHECK: DIVERGENT: %c2 = 71 ret i32 %c2 72} 73 74; We conservatively treats all parameters of a __device__ function as divergent. 75define i32 @device(i32 %n, i32 %a, i32 %b) { 76; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device' 77; CHECK: DIVERGENT: i32 %n 78; CHECK: DIVERGENT: i32 %a 79; CHECK: DIVERGENT: i32 %b 80entry: 81 %cond = icmp slt i32 %n, 0 82 br i1 %cond, label %then, label %else 83; CHECK: DIVERGENT: br i1 %cond, 84then: 85 br label %merge 86else: 87 br label %merge 88merge: 89 %c = phi i32 [ %a, %then ], [ %b, %else ] 90 ret i32 %c 91} 92 93; int i = 0; 94; do { 95; i++; // i here is uniform 96; } while (i < laneid); 97; return i == 10 ? 0 : 1; // i here is divergent 98; 99; The i defined in the loop is used outside. 100define i32 @loop() { 101; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop' 102entry: 103 %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() 104 br label %loop 105loop: 106 %i = phi i32 [ 0, %entry ], [ %i1, %loop ] 107; CHECK-NOT: DIVERGENT: %i = 108 %i1 = add i32 %i, 1 109 %exit_cond = icmp sge i32 %i1, %laneid 110 br i1 %exit_cond, label %loop_exit, label %loop 111loop_exit: 112 %cond = icmp eq i32 %i, 10 113 br i1 %cond, label %then, label %else 114; CHECK: DIVERGENT: br i1 %cond, 115then: 116 ret i32 0 117else: 118 ret i32 1 119} 120 121; Same as @loop, but the loop is in the LCSSA form. 122define i32 @lcssa() { 123; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa' 124entry: 125 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 126 br label %loop 127loop: 128 %i = phi i32 [ 0, %entry ], [ %i1, %loop ] 129; CHECK-NOT: DIVERGENT: %i = 130 %i1 = add i32 %i, 1 131 %exit_cond = icmp sge i32 %i1, %tid 132 br i1 %exit_cond, label %loop_exit, label %loop 133loop_exit: 134 %i.lcssa = phi i32 [ %i, %loop ] 135; CHECK: DIVERGENT: %i.lcssa = 136 %cond = icmp eq i32 %i.lcssa, 10 137 br i1 %cond, label %then, label %else 138; CHECK: DIVERGENT: br i1 %cond, 139then: 140 ret i32 0 141else: 142 ret i32 1 143} 144 145; Verifies sync-dependence is computed correctly in the absense of loops. 146define i32 @sync_no_loop(i32 %arg) { 147entry: 148 %0 = add i32 %arg, 1 149 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 150 %1 = icmp sge i32 %tid, 10 151 br i1 %1, label %bb1, label %bb2 152 153bb1: 154 br label %bb3 155 156bb2: 157 br label %bb3 158 159bb3: 160 %2 = add i32 %0, 2 161 ; CHECK-NOT: DIVERGENT: %2 162 ret i32 %2 163} 164 165declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() 166declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() 167declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() 168declare i32 @llvm.nvvm.read.ptx.sreg.laneid() 169 170!nvvm.annotations = !{!0, !1, !2, !3, !4} 171!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} 172!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} 173!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} 174!3 = !{i32 ()* @loop, !"kernel", i32 1} 175!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} 176