; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
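
; Flat (generic) pointers on amdhsa use the same underlying address as
; global and constant pointers, so those casts are no-ops. Casting a
; group or private pointer to flat puts the segment's aperture base,
; loaded from the queue pointer (s[4:5] at offsets 0x10 and 0x11
; below), in the high 32 bits and maps the segment's invalid pointer
; value (-1) to flat null; the reverse cast truncates to the low 32
; bits and maps flat null back to -1. The cmp/cndmask sequences
; checked below implement those selects.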

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1

; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}

; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]

; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1

; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}

; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]

; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
  %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
  %ld = load volatile i32, i32 addrspace(4)* %stof
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
  store volatile i32 0, i32* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
  load volatile i32, i32 addrspace(2)* %ftos
  ret void
}
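
; The casts of constant pointers below are folded at compile time. The
; CHECK lines pin down the null convention: group/private -1 casts to
; flat null, flat null casts back to -1, and a zero group or private
; pointer is still an ordinary offset from the aperture base.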

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
  store i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
  store i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
define void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
  store i32 7, i32* %cast
  ret void
}

; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.

; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %local, label %global

local:
  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
  br label %end

global:
  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
  br label %end

end:
  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
  store i32 %x, i32 addrspace(4)* %fptr, align 4
; %val = load i32, i32 addrspace(4)* %fptr, align 4
; store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Check for prologue initializing special SGPRs pointing to scratch.
; HSA-LABEL: {{^}}store_flat_scratch:
; HSA-DAG: s_mov_b32 flat_scratch_lo, s9
; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
; HSA: flat_store_dword
; HSA: s_barrier
; HSA: flat_load_dword
define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32* %alloca, i32 %x
  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
  store i32 %x, i32 addrspace(4)* %fptr
  ; Dummy call
  call void @llvm.amdgcn.s.barrier() #1
  %reload = load i32, i32 addrspace(4)* %fptr, align 4
  store i32 %reload, i32 addrspace(1)* %out, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind readnone }