; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s

; The kernels below generally compute an address with a getelementptr in the
; entry block and use it in a conditionally executed block. The opt runs check
; where the address computation ends up after CodeGenPrepare; the llc runs
; check the memory instruction that is selected for the access.
;
; Branch-target labels are matched as {{^BB[0-9]+}}_2 so that the checks do
; not depend on a kernel's position in this file.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; Global i32 load at a constant element offset of 7.
; OPT-LABEL: @test_sink_global_small_offset_i32(
; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
; OPT: br i1
; OPT-CI: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(1)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset 65535; the opt checks expect the GEP to stay
; in the entry block.
; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}

; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0xf000{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset 4095, which the checks expect to appear as an
; immediate offset on the load.
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset 4096, one past the previous case; the checks
; expect the offset to be supplied through a register instead.
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x1000{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_scratch_small_offset_i32(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
; The block label is matched with a regex so the check does not depend on
; this kernel's position in the file.
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; This ends up not fitting due to the reserved 4 bytes at offset 0
; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
; GCN: s_and_saveexec_b64
; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
; GCN: {{^BB[0-9]+}}_2:

define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; Element index 1024; the opt checks expect the original GEP to remain in the
; entry block and no ptrtoint to be introduced.
; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
; OPT: br i1
; OPT-NOT: ptrtoint

; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; Global load whose element index is a zero-extended kernel argument rather
; than a constant.
; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
  %offset.ext = zext i32 %offset to i64
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(1)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; The kernels that follow load an i32 through an addrspace(4) pointer at
; constant element offsets of 7, 255, 256, 4294967295, 17179869181 and 262143.

; OPT-LABEL: @test_sink_constant_small_offset_i32(
; OPT-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32(
; OPT-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT-VI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
; GCN: s_and_saveexec_b64
; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400

; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}

; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
; VI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffffff{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32(
; OPT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
; GCN: s_and_saveexec_b64
; GCN: s_add_u32
; GCN: s_addc_u32
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT-VI: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
; GCN: s_and_saveexec_b64
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}

; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

%struct.foo = type { [3 x float], [3 x float] }

; Two float loads from fields of an addrspace(3) struct pointer; the GCN checks
; expect a single ds_read2_b32 with both element offsets folded in.
; OPT-LABEL: @sink_ds_address(
; OPT: getelementptr inbounds i8,

; GCN-LABEL: {{^}}sink_ds_address:
; GCN: s_load_dword [[SREG1:s[0-9]+]],
; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
entry:
  %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
  %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
  br label %bb32

bb32:
  %a = load float, float addrspace(3)* %x, align 4
  %b = load float, float addrspace(3)* %y, align 4
  %cmp = fcmp one float %a, %b
  br i1 %cmp, label %bb34, label %bb33

bb33:
  unreachable

bb34:
  unreachable
}

; Address offset is not a multiple of 4. This is a valid mubuf offset,
; but not smrd.

; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
; OPT: br i1 %tmp0,
; OPT: if:
; OPT: getelementptr i8, {{.*}} 4095
define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
  %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; The addrspace(3) kernels below use a GEP of 7 i32 elements as the pointer
; operand of an atomic operation; the opt checks expect it to be rebuilt next
; to the use as an i8 GEP of 28 bytes named %sunkaddr.

; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic
  %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Here %in.gep is the compare operand of the cmpxchg, not its pointer operand,
; and the checks expect the GEP to stay in the entry block.
; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
; OPT: br i1
; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
  %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
  br label %endif

endif:
  %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ]
  store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset -4096. On the tahiti, bonaire and tonga opt
; runs the GEP is expected to stay ahead of the branch; on the gfx900 opt run
; it is expected after the branch as %sunkaddr.
; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
; OPT-SICIVI: br
; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep

; OPT-GFX9: br
; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr

; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}}
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Byte offset -4097, one below the previous case; the checks expect the GEP to
; stay ahead of the branch on every opt run.
; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
; OPT: br
; OPT: load i8, i8 addrspace(1)* %in.gep

; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_small_offset_ds_append(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %1, i1 false)
define amdgpu_kernel void @test_sink_small_offset_ds_append(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %in.gep, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_small_offset_ds_consume(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %1, i1 false)
; Kernel body for the ds.consume case. %in.gep (7 i32 elements past %in) is
; computed in the entry block, and its only use is the
; llvm.amdgcn.ds.consume call in the %if block.
define amdgpu_kernel void @test_sink_small_offset_ds_consume(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  ; The access in %if is skipped when %tid is 0.
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %in.gep, i1 false)
  br label %endif

endif:
  ; 0 is stored when the %if block was skipped.
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Intrinsics called by the kernels in this file.
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #3
declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #3

; Attribute groups. NOTE(review) - group #1 is not referenced by any
; declaration or call site visible in this file; confirm before removing.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind argmemonly }
attributes #3 = { argmemonly convergent nounwind willreturn }