; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s

; Codegen test for the llvm.amdgcn.implicitarg.ptr intrinsic.
;
; The file is compiled twice (see the two command lines above): once for the
; amdhsa triple on kaveri and once for the mesa3d triple on tahiti. Checks
; shared by both configurations use the common prefix; checks that differ
; use the per-configuration prefix.
;
; Attribute group #1 adds "amdgpu-implicitarg-num-bytes"="48". In the
; expectations below, the amdhsa kernarg segment size is 48 bytes larger for
; the kernels carrying #1 than for their #0 counterparts, while the mesa3d
; expectations are identical for both.
;
; Most kernels/functions do a volatile i32 load through the intrinsic's
; result so that the address computation cannot be dropped.

; Kernel with no explicit arguments and no implicit-argument attribute.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 0
; MESA: kernarg_segment_byte_size = 16

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Same as above, but with the 48-byte implicit-argument attribute (#1).
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 48
; MESA: kernarg_segment_byte_size = 16

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel with 112 bytes of explicit arguments. The expected load offset 0x1c
; is 28, i.e. presumably 112 bytes expressed in dwords -- confirm against the
; target's scalar-load offset encoding.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 112
; MESA: kernarg_segment_byte_size = 128

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; 112 bytes of explicit arguments plus the 48-byte attribute (#1); the amdhsa
; expectation is 112 + 48 = 160 bytes.
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 160
; MESA: kernarg_segment_byte_size = 128

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Non-kernel function: the load is expected to use s[4:5] at offset 0,
; followed directly by a wait and the return.
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Same body and expectations as func_implicitarg_ptr. Note that this function
; uses attribute group #0, not #1, despite the "opencl_" name.
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel without explicit arguments calling a function that uses the
; intrinsic: s4/s5 must not be mentioned before the call.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 0
; MESA: kernarg_segment_byte_size = 16
; GCN-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; As above, with the 48-byte attribute (#1) on the calling kernel.
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 48
; MESA: kernarg_segment_byte_size = 16
; GCN-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel with 112 bytes of explicit arguments calling the function: s[4:5] is
; expected to be advanced by 0x70 (= 112) with a 64-bit add before the call.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 112
; MESA: kernarg_segment_byte_size = 128

; HSA: s_add_u32 s4, s4, 0x70
; MESA: s_add_u32 s4, s4, 0x70

; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; As above, with the 48-byte attribute (#1) on the calling kernel.
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 160
; MESA: kernarg_segment_byte_size = 128

; GCN: s_add_u32 s4, s4, 0x70
; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Function-to-function call: s4/s5 must not be mentioned anywhere in the
; caller's output.
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
; GCN-NOT: s4
; GCN-NOT: s5
; GCN-NOT: s[4:5]
define void @func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Same body and expectations as func_call_implicitarg_ptr_func (also #0).
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
; GCN-NOT: s4
; GCN-NOT: s5
; GCN-NOT: s[4:5]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function using both intrinsics. The checks expect one load from a
; register pair that was set to 0 and one load from s[4:5].
; NOTE(review): the zero-based load presumably corresponds to
; llvm.amdgcn.kernarg.segment.ptr -- confirm.
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Same body and expectations as func_kernarg_implicitarg_ptr (also #0).
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Kernel with 112 bytes of explicit arguments calling the function that uses
; both intrinsics: s[4:5] is again expected to be advanced by 0x70 (= 112).
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
; GCN: s_add_u32 s4, s4, 0x70
; GCN: s_addc_u32 s5, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_kernarg_implicitarg_ptr()
  ret void
}

; Explicit arguments are <16 x i32> followed by i32 (64 + 4 = 68 bytes) with
; the 48-byte attribute (#1). The amdhsa expectation of 120 equals 68 rounded
; up to 72 plus 48, i.e. the explicit arguments are not padded out to the
; 64-byte alignment of the vector before the implicit arguments are placed.
; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
; HSA: kernarg_segment_byte_size = 120
; MESA: kernarg_segment_byte_size = 84
; GCN: kernarg_segment_alignment = 6
define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Intrinsics under test.
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2

; #0: plain functions/kernels; #1: additionally requests 48 bytes of implicit
; arguments; #2: attributes of the intrinsic declarations.
attributes #0 = { nounwind noinline }
attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #2 = { nounwind readnone speculatable }