; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s

; Codegen checks for kernels whose control flow depends on a per-workitem
; value. Every kernel below derives its branch conditions from
; llvm.amdgcn.workitem.id.x, and the directives match the exec-mask
; save/restore instructions that llc emits around each branch.
;
; The directives reference basic-block names printed in the verbose assembly
; comments (for example %then, %exit, %bb20), so the labels in the functions
; below must not be renamed without updating the matching directives.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions


; waitcnt should be inserted after exec modification
; SI: v_cmp_lt_i32_e32 vcc, 0,
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]

; NOTE(review): the register patterns in the s_mov_b64 and "v_mov ..., -1"
; directives below accept a single digit only, so they cannot match
; two-digit register numbers - confirm that this is intended.
; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
; SI: s_and_saveexec_b64
; SI-NEXT: ; mask branch

; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
; SI-NEXT: ; mask branch
;
; The kernel switches on the workitem id: ids 0 and 1 have dedicated cases,
; and the default case splits once more on "id == 2". Each of the four paths
; stores a different constant (13, 17, 19, 21) to %dst[%b].
define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  switch i32 %tid, label %default [
    i32 0, label %case0
    i32 1, label %case1
  ]

case0:
  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
  br label %end

case1:
  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
  br label %end

default:
  %cmp8 = icmp eq i32 %tid, 2
  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  br i1 %cmp8, label %if, label %else

if:
  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

else:
  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

end:
  ret void
}

; Single guarded store: workitems with a non-zero id write 999 to %dst[id],
; then every workitem reaches the shared exit block.

; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  br label %exit

exit:
  ret void
}

; Same guarded store as simple_test_v_if, except that the then block returns
; directly instead of branching to the exit block. The expected instruction
; sequence is the same as for simple_test_v_if.

; FIXME: It would be better to endpgm in the then block.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  ret void
}

; Final block has more than a ret to execute. This was miscompiled
; before function exit blocks were unified since the endpgm would
; terminate the then wavefront before reaching the store.
;
; Both successors return on their own here, and the exit block additionally
; performs a volatile store to addrspace(3). The directives expect the exit
; block first, then a flow block, then the then block, and finally a single
; unified return block holding s_endpgm.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]

; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
; SI: ds_write_b32

; SI-NEXT: {{^}}[[FLOW]]:
; SI-NEXT: s_or_saveexec_b64
; SI-NEXT: s_xor_b64 exec, exec
; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]

; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
; SI: s_waitcnt
; SI-NEXT: buffer_store_dword

; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  store volatile i32 7, i32 addrspace(3)* undef
  ret void
}

; Guarded counting loop: workitems with a non-zero id enter a loop whose
; induction variable runs from the workitem id up to id + 64; each iteration
; loads a dword and stores it to %dst[%i].

; SI-LABEL: {{^}}simple_test_v_loop:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}

; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100
; SI: s_cbranch_vccz [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm

define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  %limit = add i32 %tid, 64
  br i1 %is.0, label %loop, label %exit

loop:
  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
  ; NOTE(review): %gep.src is computed but never used; the load below reads
  ; %src itself on every iteration. This looks unintentional, but changing it
  ; may alter the matched output - confirm before touching.
  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
  %load = load i32, i32 addrspace(1)* %src
  store i32 %load, i32 addrspace(1)* %gep.dst
  %i.inc = add nsw i32 %i, 1
  %cmp = icmp eq i32 %limit, %i.inc
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}

; Loop with two exits: the loop is entered only when the bound loaded from
; %arg3[id] is positive, leaves early when either loaded operand equals -1,
; and otherwise stores the sum of the operands and continues while the
; 64-bit counter is below the bound.

; SI-LABEL: {{^}}multi_vcond_loop:

; Load loop limit from buffer
; Branch to exit if uniformly not taken
; SI: ; %bb.0:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; Initialize inner condition to false
; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]

; Clear exec bits for workitems that load -1s
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_xor_b64 [[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]]
; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]

; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
; SI: buffer_store_dword
; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]

; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]]
; SI-NEXT: s_mov_b64 [[MOVED_TMP:s\[[0-9]+:[0-9]+\]]], [[TMP]]
; SI-NEXT: s_and_b64 [[MASKED_ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG3]]
; SI-NEXT: s_or_b64 [[COND_STATE]], [[MASKED_ORNEG3]], [[MOVED_TMP]]
; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm

define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
  ; Per-workitem loop bound: %arg3[id].
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tmp4 = sext i32 %tmp to i64
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 0
  %tmp8 = sext i32 %tmp6 to i64
  br i1 %tmp7, label %bb10, label %bb26

bb10:                                             ; preds = %bb, %bb20
  ; Loop header: load both operands at index (counter + id) and leave the
  ; loop if either one equals -1.
  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
  %tmp12 = add nsw i64 %tmp11, %tmp4
  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
  %tmp17 = icmp ne i32 %tmp14, -1
  %tmp18 = icmp ne i32 %tmp16, -1
  %tmp19 = and i1 %tmp17, %tmp18
  br i1 %tmp19, label %bb20, label %bb26

bb20:                                             ; preds = %bb10
  ; Loop latch: store the sum, advance the counter, and continue while the
  ; counter is below the bound.
  %tmp21 = add nsw i32 %tmp16, %tmp14
  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
  %tmp23 = add nuw nsw i64 %tmp11, 1
  %tmp24 = icmp slt i64 %tmp23, %tmp8
  br i1 %tmp24, label %bb10, label %bb26

bb26:                                             ; preds = %bb10, %bb20, %bb
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }