; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,FLATSCR %s

; FIXME: Generated test checks do not check metadata at the end of the
; function, so this also includes manually added checks.

; Test that we can select a statically sized alloca outside of the
; entry block.

; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
; alignment less than the stack alignment.
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x400
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s8, 0
; MUBUF-NEXT:    s_cbranch_scc1 BB0_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_cmp_lg_u32 s9, 0
; MUBUF-NEXT:    s_cbranch_scc1 BB0_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  BB0_3: ; %bb.2
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_mov_b32 s32, 16
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT:    s_cbranch_scc1 BB0_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT:    s_cbranch_scc1 BB0_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    s_mov_b32 s2, s32
; FLATSCR-NEXT:    s_movk_i32 s3, 0x1000
; FLATSCR-NEXT:    s_add_i32 s4, s2, s3
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_add_u32 s2, s2, s3
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_lshl_b32 s2, s6, 2
; FLATSCR-NEXT:    s_mov_b32 s32, s4
; FLATSCR-NEXT:    s_add_i32 s4, s4, s2
; FLATSCR-NEXT:    scratch_load_dword v2, off, s4
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  BB0_3: ; %bb.2
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_endpgm

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112

; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040

define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; MUBUF-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x1000
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s6, 0
; MUBUF-NEXT:    s_cbranch_scc1 BB1_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    s_lshl_b32 s7, s7, 2
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  BB1_2: ; %bb.1
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; FLATSCR-NEXT:    s_mov_b32 s32, 64
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s2, 0
; FLATSCR-NEXT:    s_cbranch_scc1 BB1_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_lshl_b32 s3, s3, 2
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  BB1_2: ; %bb.1
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_endpgm
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160

; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088


define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_mov_b32 s33, s32
; MUBUF-NEXT:    s_add_u32 s32, s32, 0x400
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz BB2_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; MUBUF-NEXT:    s_and_b64 exec, exec, vcc
; MUBUF-NEXT:    s_cbranch_execz BB2_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v3, s6
; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  BB2_3: ; %bb.2
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_sub_u32 s32, s32, 0x400
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s5, s33
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_mov_b32 s33, s32
; FLATSCR-NEXT:    s_add_u32 s32, s32, 16
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz BB2_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT:    s_and_b64 exec, exec, vcc
; FLATSCR-NEXT:    s_cbranch_execz BB2_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    s_mov_b32 s2, s32
; FLATSCR-NEXT:    s_movk_i32 s3, 0x1000
; FLATSCR-NEXT:    s_add_i32 s4, s2, s3
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v3, 1
; FLATSCR-NEXT:    s_add_u32 s2, s2, s3
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v4, 2, s4
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
; FLATSCR-NEXT:    s_mov_b32 s32, s4
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  BB2_3: ; %bb.2
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_sub_u32 s32, s32, 16
; FLATSCR-NEXT:    s_mov_b32 s33, s5
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_add_u32 s4, s32, 0xfc0
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    s_and_b32 s33, s4, 0xfffff000
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_add_u32 s32, s32, 0x2000
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz BB3_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v5, s6
; MUBUF-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  BB3_2: ; %bb.1
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_sub_u32 s32, s32, 0x2000
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_add_u32 s0, s32, 63
; FLATSCR-NEXT:    s_mov_b32 s3, s33
; FLATSCR-NEXT:    s_and_b32 s33, s0, 0xffffffc0
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_add_u32 s32, s32, 0x80
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz BB3_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v5, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v6, 1
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[5:6], s2
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  BB3_2: ; %bb.1
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_sub_u32 s32, s32, 0x80
; FLATSCR-NEXT:    s_mov_b32 s33, s3
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }