; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; Test that doing a shift of a pointer with a constant add will be
; folded into the constant offset addressing mode even if the add has
; multiple uses. This is relevant to accessing 2 separate, adjacent
; LDS globals.


declare i32 @llvm.amdgcn.workitem.id.x() #1

@lds0 = addrspace(3) global [512 x float] undef, align 4
@lds1 = addrspace(3) global [512 x float] undef, align 4


; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8

; GCN-LABEL: {{^}}load_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  store float %val0, float addrspace(1)* %out
  ret void
}

; Make sure once the first use is folded into the addressing mode, the
; remaining add use goes through the normal shl + add constant fold.

; GCN-LABEL: {{^}}load_shl_base_lds_1:
; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}

; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]

; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[RESULT]]
; GCN-DAG: buffer_store_dword [[ADDUSE]]
; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %shl_add_use = shl i32 %idx.0, 2
  store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4
  store float %val0, float addrspace(1)* %out
  ret void
}

@maxlds = addrspace(3) global [65536 x i8] undef, align 4

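; 65535 is the largest immediate that fits in the 16-bit DS instruction
; offset field, so the add should still fold into the ds_read_u8 offset.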
; GCN-LABEL: {{^}}load_shl_base_lds_max_offset
; GCN: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 65535
  %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
  %val0 = load i8, i8 addrspace(3)* %arrayidx0
  store i32 %idx.0, i32 addrspace(1)* %add_use
  store i8 %val0, i8 addrspace(1)* %out
  ret void
}

; The two globals are placed adjacent in memory, so the same base
; pointer can be used with an offset into the second one.

; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints

; GCN-LABEL: {{^}}load_shl_base_lds_2:
; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GCN-DAG: s_mov_b32 m0, -1

; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9

; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 64
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}store_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
  store float 1.0, float addrspace(3)* %arrayidx0, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}


; --------------------------------------------------------------------------------
; Atomics.
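; Each DS atomic below should get the same fold: the (add tid, 2) becomes
; an offset:8 on the atomic instruction.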
@lds2 = addrspace(3) global [512 x i32] undef, align 4

; define amdgpu_kernel void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
;   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
;   %idx.0 = add nsw i32 %tid.x, 2
;   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
;   %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4
;   store i32 %val, i32 addrspace(1)* %out, align 4
;   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
;   ret void
; }


; GCN-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
  %result = extractvalue { i32, i1 } %pair, 0
  store i32 %result, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_swap_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_add_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_sub_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_and_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_or_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_xor_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; define amdgpu_kernel void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
;   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
;   %idx.0 = add nsw i32 %tid.x, 2
;   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
;   %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
;   store i32 %val, i32 addrspace(1)* %out, align 4
;   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
;   ret void
; }
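; The signed min/max atomics should select ds_min_rtn_i32/ds_max_rtn_i32;
; umin/umax should select the unsigned _u32 forms, each still with the
; folded offset:8.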
; GCN-LABEL: {{^}}atomic_min_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_max_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_umin_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_umax_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; GCN: s_endpgm
define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}
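; The remaining tests build the LDS/private pointers directly with inttoptr
; instead of an LDS global. The add (or the or, further down) shared by two
; differently scaled shifts should still fold into each use's immediate
; offset whenever it fits the encoding.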
; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_lds:
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN: ds_write_b32 [[SCALE0]], v{{v?[0-9]*}}{{[0-9]+}} offset:32

; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
define void @shl_add_ptr_combine_2use_lds(i32 %idx) #0 {
  %idx.add = add nuw i32 %idx, 4
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}
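; 8191 << 3 = 65528 still fits in the 16-bit DS offset field, but
; 8191 << 4 = 0x1fff0 does not, so the second store keeps a materialized add.
; With a base of 4096 (the both_max test), neither 4096 << 4 nor 4096 << 5
; fits, so the add stays in front of both shifts.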
; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_lds_offset:
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 0x1fff0, [[SCALE1]]
; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}}
define void @shl_add_ptr_combine_2use_max_lds_offset(i32 %idx) #0 {
  %idx.add = add nuw i32 %idx, 8191
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_lds_offset:
; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x1000, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+$}}
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+$}}
define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
  %idx.add = add nuw i32 %idx, 4096
  %shl0 = shl i32 %idx.add, 4
  %shl1 = shl i32 %idx.add, 5
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}
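; Private (MUBUF offen) accesses use a 12-bit immediate offset, max 4095:
; 4 << 2 = 16 and 4 << 3 = 32 fold directly, 511 << 3 = 4088 still fits while
; 511 << 4 = 0x1ff0 does not, and with a base of 256 neither shifted value fits.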
; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private:
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16

; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32
define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
  %idx = zext i16 %idx.arg to i32
  %idx.add = add nuw i32 %idx, 4
  %shl0 = shl i32 %idx.add, 2
  %shl1 = shl i32 %idx.add, 3
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)*
  store volatile i32 9, i32 addrspace(5)* %ptr0
  store volatile i32 10, i32 addrspace(5)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_private_offset:
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:4088
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x1ff0, [[SCALE1]]
; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], 0 offen{{$}}
define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #0 {
  %idx = zext i16 %idx.arg to i32
  %idx.add = add nuw i32 %idx, 511
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)*
  store volatile i32 9, i32 addrspace(5)* %ptr0
  store volatile i32 10, i32 addrspace(5)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_private_offset:
; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x100, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen{{$}}
define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.arg) #0 {
  %idx = zext i16 %idx.arg to i32
  %idx.add = add nuw i32 %idx, 256
  %shl0 = shl i32 %idx.add, 4
  %shl1 = shl i32 %idx.add, 5
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)*
  store volatile i32 9, i32 addrspace(5)* %ptr0
  store volatile i32 10, i32 addrspace(5)* %ptr1
  ret void
}

; FIXME: This or should fold into an offset on the write
; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds:
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]]
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}}

; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
  %idx.add = or i32 %idx, 4
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_max_lds_offset:
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528
; GCN-DAG: v_or_b32_e32 [[ADD1:v[0-9]+]], 0x1fff0, [[SCALE1]]
; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}}
define void @shl_or_ptr_combine_2use_max_lds_offset(i32 %idx) #0 {
  %idx.add = or i32 %idx, 8191
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }