; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Tests for indirect (dynamically indexed) insertion into vectors on gfx900.

; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll
; to avoid gfx9 scheduling induced issues.

; Two dynamic insertelements into the same <16 x i32> kernel argument within
; one basic block. The first index is loaded per work-item from memory (the
; checks expect it in a VGPR) and the second index is that value plus one.
; The first inserted value is produced by inline asm and is stored again in
; %bb1, so it stays live out of the entry block.
;
; Expected code shape: the vector argument arrives in SGPRs via a single
; s_load_dwordx16 and is copied to VGPRs, then the inserts are done with
; compare + v_cndmask_b32 selects (32 in total, presumably 16 per insert),
; and the result is written back with four dwordx4 stores.

; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}}
; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62

; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]

; GCN: v_cmp_eq_u32_e32
; GCN-COUNT-32: v_cndmask_b32

; GCN-COUNT-4: buffer_store_dwordx4
define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
entry:
  ; Per-work-item index: %idx0 = %in[workitem.id.x], %idx1 = %idx0 + 1.
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %id.ext = zext i32 %id to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
  %idx0 = load volatile i32, i32 addrspace(1)* %gep
  %idx1 = add i32 %idx0, 1
  ; The inserted value is defined by inline asm so it cannot be folded away.
  %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
  %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
  %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
  store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
  %cmp = icmp eq i32 %id, 0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  ; Second use of the asm result, outside the block that performs the inserts.
  store volatile i32 %live.out.val, i32 addrspace(1)* undef
  br label %bb2

bb2:
  ret void
}

; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The
; gpr_idx mode switching sequence is expanded late for this reason.

; Two dynamic inserts, at indices %in + 1 and %in + 2, into a constant
; <16 x float>. Each insert is expected to be exactly one v_mov_b32_e32
; bracketed directly by s_set_gpr_idx_on / s_set_gpr_idx_off, with no other
; instruction in between.

; GCN-LABEL: {{^}}insert_w_offset_multiple_in_block:

; GCN: s_set_gpr_idx_on
; GCN-NEXT: v_mov_b32_e32
; GCN-NEXT: s_set_gpr_idx_off

; GCN: s_set_gpr_idx_on
; GCN-NEXT: v_mov_b32_e32
; GCN-NOT: v_mov_b32_e32
; GCN-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_w_offset_multiple_in_block(<16 x float> addrspace(1)* %out1, i32 %in) #0 {
entry:
  %add1 = add i32 %in, 1
  %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1
  %add2 = add i32 %in, 2
  %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2
  ; Both the intermediate and the final vector are stored, so neither insert
  ; can be dropped.
  store <16 x float> %ins1, <16 x float> addrspace(1)* %out1
  %out2 = getelementptr <16 x float>, <16 x float> addrspace(1)* %out1, i32 1
  store <16 x float> %ins2, <16 x float> addrspace(1)* %out2

  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare void @llvm.amdgcn.s.barrier() #2

; Groups #1 and #2 are referenced by the declarations above and must be defined.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind convergent }