; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s

; Check that outstanding memory operations are drained with the right
; s_waitcnt (and, on GFX10, s_waitcnt_vscnt for the separate store counter)
; before an s_barrier and at function boundaries.

; A global load precedes the barrier, so a vmcnt(0) wait is required.
; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
  ret void
}

; A global store precedes the barrier: GFX8/9 drain it with vmcnt(0), while
; GFX10 tracks stores in the separate vscnt counter.
; GCN-LABEL: barrier_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  store i32 0, i32 addrspace(1)* %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier() #3
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp7
  store i32 1, i32 addrspace(1)* %tmp8, align 4
  ret void
}

; Both a store and a load precede the barrier: vmcnt(0) covers the load on
; all targets, and GFX10 additionally drains the store counter.
; GCN-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  store i32 0, i32 addrspace(1)* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp9
  store i32 %tmp7, i32 addrspace(1)* %tmp10, align 4
  ret void
}

; A flat load may also hit LDS, so the wait covers lgkmcnt as well as vmcnt.
; GCN-LABEL: barrier_vmcnt_flat:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp4 = load i32, i32* %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
  store i32 %tmp4, i32* %tmp7, align 4
  ret void
}

; A flat store precedes the barrier: GFX8/9 need vmcnt(0) and lgkmcnt(0),
; while GFX10 waits on lgkmcnt plus the separate vscnt store counter.
; GCN-LABEL: barrier_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier() #3
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, i32* %arg, i64 %tmp7
  store i32 1, i32* %tmp8, align 4
  ret void
}

; A flat store plus a flat load precede the barrier.
; GCN-LABEL: barrier_vmcnt_vscnt_flat:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp7 = load i32, i32* %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
  store i32 %tmp7, i32* %tmp10, align 4
  ret void
}

; Same pattern as above, but with workgroup-scope fences.
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
; GFX8_9: s_waitcnt lgkmcnt(0){{$}}
; GFX8_9: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp7 = load i32, i32* %tmp6, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
  store i32 %tmp7, i32* %tmp10, align 4
  ret void
}

; No barrier here: the wait is forced by the data dependence of the store on
; the loaded value.
; GCN-LABEL: load_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
  ret void
}

; As above, for a flat load; no vscnt wait should be emitted.
; GCN-LABEL: load_vmcnt_flat:
; GCN: flat_load_dword
; GCN-NOT: vscnt
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp4 = load i32, i32* %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
  store i32 %tmp4, i32* %tmp7, align 4
  ret void
}

; In a callable function, an outstanding private (scratch) store must be
; drained before returning via s_setpc_b64.
; GCN-LABEL: store_vscnt_private:
; GCN: buffer_store_dword
; GFX8_9: s_waitcnt vmcnt(0)
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_private(i32 addrspace(5)* %p) {
  store i32 0, i32 addrspace(5)* %p
  ret void
}

; Likewise for a global store.
; GCN-LABEL: store_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8_9: s_waitcnt vmcnt(0)
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_global(i32 addrspace(1)* %p) {
  store i32 0, i32 addrspace(1)* %p
  ret void
}

; Likewise for a flat store, which also counts against lgkmcnt.
; GCN-LABEL: store_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_flat(i32* %p) {
  store i32 0, i32* %p
  ret void
}

; Every callable function begins by waiting on all counters.
; GCN-LABEL: function_prologue:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @function_prologue() {
  ret void
}

declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.workitem.id.x()
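
; Two of the s_barrier call sites above reference attribute group #3, whose
; definition was missing from this listing; a minimal definition is assumed
; here so the module parses. The original attribute contents are unknown.
attributes #3 = { nounwind }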