; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s

; Every test below has the same shape. An address is computed in the entry
; block, and its only memory user sits in a conditionally executed block.
; The opt runs check whether CodeGenPrepare leaves the original GEP in place
; or rewrites it next to the user, and the llc runs check the instructions
; that are finally selected.

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"

; Global load at a small constant offset (7 x i32).
; OPT-LABEL: @test_sink_global_small_offset_i32(
; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
; OPT: br i1
; OPT-CI: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(1)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Byte offset 65535 on a global load. The GEP is expected to stay in the entry
; block, and the selected load carries no immediate offset.
; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Byte offset 4095 is expected to appear as an immediate offset on the load.
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Byte offset 4096, one past the previous test. The load is expected to carry
; no immediate offset.
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Scratch (addrspace 5) access at element 1022 of the alloca. The accesses are
; expected to use an immediate offset of 4092.
; OPT-LABEL: @test_sink_scratch_small_offset_i32(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; This ends up not fitting due to the reserved 4 bytes at offset 0
; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
; GCN: s_and_saveexec_b64
; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: {{^BB[0-9]+}}_2:

define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; Element 1024 of the alloca. The original GEP is expected to stay in the
; entry block, and the accesses use a register offset only.
; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
; OPT: br i1
; OPT-NOT: ptrtoint

; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; Global load indexed by a zero-extended kernel argument instead of a constant.
; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
  %offset.ext = zext i32 %offset to i64
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(1)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Constant address space (addrspace 4) load at element offset 7.
; OPT-LABEL: @test_sink_constant_small_offset_i32(
; OPT-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Element offset 255, expected as the immediate 0xff on tahiti.
; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32(
; OPT-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Element offset 256. On tahiti the offset is expected to be materialized in a
; scalar register (0x400) instead of being an immediate.
; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT-VI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
; GCN: s_and_saveexec_b64
; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400

; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Element offset 4294967295. The address is expected to be formed with a
; 64-bit scalar add before the load.
; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Element offset 17179869181. The original GEP is expected to remain for every
; opt run.
; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32(
; OPT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
; GCN: s_and_saveexec_b64
; GCN: s_add_u32
; GCN: s_addc_u32
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Element offset 262143 (byte offset 0xffffc). Each llc target is expected to
; encode it differently.
; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Element offset 262144 (byte offset 0x100000), one element past the previous
; test.
; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT-VI: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
; GCN: s_and_saveexec_b64
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}

; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

%struct.foo = type { [3 x float], [3 x float] }

; Two LDS (addrspace 3) loads from fields of the same struct. They are expected
; to be selected as a single ds_read2_b32 with both element offsets folded.
; OPT-LABEL: @sink_ds_address(
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}sink_ds_address:
; GCN: s_load_dword [[SREG1:s[0-9]+]],
; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
entry:
  %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
  %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
  br label %bb32

bb32:
  %a = load float, float addrspace(3)* %x, align 4
  %b = load float, float addrspace(3)* %y, align 4
  %cmp = fcmp one float %a, %b
  br i1 %cmp, label %bb34, label %bb33

bb33:
  unreachable

bb34:
  unreachable
}

; Address offset is not a multiple of 4. This is a valid mubuf offset,
; but not smrd.

; An i32 load with align 1 through a bitcast of an i8 GEP at byte offset 4095
; in the constant address space. The checks expect an i8 GEP with offset 4095
; inside the conditional block.
; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
; OPT: br i1 %tmp0,
; OPT: if:
; OPT: getelementptr i8, {{.*}} 4095
define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
  %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; The address operand of an atomicrmw on LDS is expected to be rewritten as an
; i8 GEP (7 x i32 = 28 bytes) next to the atomic.
; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Same as the atomicrmw test, with the GEP used as the pointer operand of a
; cmpxchg.
; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic
  %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Here the GEP is the compare value of the cmpxchg, not its pointer operand,
; so the original GEP is expected to stay in the entry block.
; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
; OPT: br i1
; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
  %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
  br label %endif

endif:
  %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ]
  store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Pointer operand of the llvm.amdgcn.atomic.inc intrinsic.
; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Pointer operand of the llvm.amdgcn.atomic.dec intrinsic.
; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Negative byte offset -4096 on a global load. Only the gfx900 opt run is
; expected to move the address into the conditional block; the other opt runs
; keep the GEP in the entry block.
; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
; OPT-SICIVI: br
; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep

; OPT-GFX9: br
; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr

; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Byte offset -4097, one below the previous test. The GEP is expected to stay
; in the entry block for every opt run.
; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
; OPT: br
; OPT: load i8, i8 addrspace(1)* %in.gep

; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind argmemonly }