/*
 * Copyright © 2024 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"

namespace aco {
namespace {
13 struct branch_ctx {
14    Program* program;
15    std::vector<bool> blocks_incoming_exec_used;
16 
branch_ctxaco::__anon3a46ac430111::branch_ctx17    branch_ctx(Program* program_)
18        : program(program_), blocks_incoming_exec_used(program_->blocks.size(), true)
19    {}
20 };
21 
22 void
remove_linear_successor(branch_ctx & ctx,Block & block,uint32_t succ_index)23 remove_linear_successor(branch_ctx& ctx, Block& block, uint32_t succ_index)
24 {
25    Block& succ = ctx.program->blocks[succ_index];
26    ASSERTED auto it = std::remove(succ.linear_preds.begin(), succ.linear_preds.end(), block.index);
27    assert(std::next(it) == succ.linear_preds.end());
28    succ.linear_preds.pop_back();
29    it = std::remove(block.linear_succs.begin(), block.linear_succs.end(), succ_index);
30    assert(std::next(it) == block.linear_succs.end());
31    block.linear_succs.pop_back();
32 
33    if (succ.linear_preds.empty()) {
34       /* This block became unreachable - Recursively remove successors. */
35       succ.instructions.clear();
36       for (unsigned i : succ.linear_succs)
37          remove_linear_successor(ctx, succ, i);
38    }
39 }
40 
41 void
try_remove_simple_block(branch_ctx & ctx,Block & block)42 try_remove_simple_block(branch_ctx& ctx, Block& block)
43 {
44    if (!block.instructions.empty() && block.instructions.front()->opcode != aco_opcode::s_branch)
45       return;
46 
47    /* Don't remove the preheader as it might be needed as convergence point
48     * in order to insert code (e.g. for loop alignment, wait states, etc.).
49     */
50    if (block.kind & block_kind_loop_preheader)
51       return;
52 
53    unsigned succ_idx = block.linear_succs[0];
54    Block& succ = ctx.program->blocks[succ_idx];
55    for (unsigned pred_idx : block.linear_preds) {
56       Block& pred = ctx.program->blocks[pred_idx];
57       assert(pred.index < block.index);
58       assert(!pred.instructions.empty() && pred.instructions.back()->isBranch());
59       Instruction* branch = pred.instructions.back().get();
60       if (branch->opcode == aco_opcode::p_branch) {
61          /* The predecessor unconditionally jumps to this block. Redirect to successor. */
62          pred.linear_succs[0] = succ_idx;
63          succ.linear_preds.push_back(pred_idx);
64       } else if (pred.linear_succs[0] == succ_idx || pred.linear_succs[1] == succ_idx) {
65          /* The predecessor's alternative target is this block's successor. */
66          pred.linear_succs[0] = succ_idx;
67          pred.linear_succs[1] = pred.linear_succs.back(); /* In case of discard */
68          pred.linear_succs.pop_back();
69          branch->opcode = aco_opcode::p_branch;
70       } else if (pred.linear_succs[1] == block.index) {
71          /* The predecessor jumps to this block. Redirect to successor. */
72          pred.linear_succs[1] = succ_idx;
73          succ.linear_preds.push_back(pred_idx);
74       } else {
75          /* This block is the fall-through target of the predecessor. */
76          assert(pred_idx == block.index - 1);
77          if (block.instructions.empty()) {
78             /* If this block is empty, just fall-through to the successor. */
79             pred.linear_succs[0] = succ_idx;
80             succ.linear_preds.push_back(pred_idx);
81             continue;
82          }
83 
84          /* Otherwise, check if there is a fall-through path for the jump target. */
85          if (block.index >= pred.linear_succs[1])
86             return;
87          for (unsigned j = block.index + 1; j < pred.linear_succs[1]; j++) {
88             if (!ctx.program->blocks[j].instructions.empty())
89                return;
90          }
91          pred.linear_succs[0] = pred.linear_succs[1];
92          pred.linear_succs[1] = succ_idx;
93          succ.linear_preds.push_back(pred_idx);
94 
95          /* Invert the condition. This branch now falls through to its original target.
96           * However, we don't update the fall-through target since this instruction
97           * gets lowered in the next step, anyway.
98           */
99          if (branch->opcode == aco_opcode::p_cbranch_nz)
100             branch->opcode = aco_opcode::p_cbranch_z;
101          else
102             branch->opcode = aco_opcode::p_cbranch_nz;
103       }
104 
105       /* Update the branch target. */
106       branch->branch().target[0] = succ_idx;
107    }
108 
109    /* If this block is part of the logical CFG, also connect pre- and successors. */
110    if (!block.logical_succs.empty()) {
111       assert(block.logical_succs.size() == 1);
112       unsigned logical_succ_idx = block.logical_succs[0];
113       Block& logical_succ = ctx.program->blocks[logical_succ_idx];
114       ASSERTED auto it = std::remove(logical_succ.logical_preds.begin(),
115                                      logical_succ.logical_preds.end(), block.index);
116       assert(std::next(it) == logical_succ.logical_preds.end());
117       logical_succ.logical_preds.pop_back();
118       for (unsigned pred_idx : block.logical_preds) {
119          Block& pred = ctx.program->blocks[pred_idx];
120          std::replace(pred.logical_succs.begin(), pred.logical_succs.end(), block.index,
121                       logical_succ_idx);
122 
123          if (pred.logical_succs.size() == 2 && pred.logical_succs[0] == pred.logical_succs[1])
124             pred.logical_succs.pop_back(); /* This should have been optimized in NIR! */
125          else
126             logical_succ.logical_preds.push_back(pred_idx);
127       }
128 
129       block.logical_succs.clear();
130       block.logical_preds.clear();
131    }
132 
133    remove_linear_successor(ctx, block, succ_idx);
134    block.linear_preds.clear();
135    block.instructions.clear();
136 }
137 
138 void
try_merge_break_with_continue(branch_ctx & ctx,Block & block)139 try_merge_break_with_continue(branch_ctx& ctx, Block& block)
140 {
141    /* Look for this:
142     * BB1:
143     *    ...
144     *    p_branch_z exec BB3, BB2
145     * BB2:
146     *    ...
147     *    s[0:1], scc = s_andn2 s[0:1], exec
148     *    s_cbranch_scc0 BB4
149     * BB3:
150     *    exec = s_mov_b64 s[0:1]
151     *    s_branch BB1
152     * BB4:
153     *    ...
154     *
155     * And turn it into this:
156     * BB1:
157     *    ...
158     *    p_branch_z exec BB3, BB2
159     * BB2:
160     *    ...
161     * BB3:
162     *    s[0:1], scc, exec = s_andn2_wrexec s[0:1], exec
163     *    s_cbranch_scc1 BB1, BB4
164     * BB4:
165     *    ...
166     */
167    if (block.linear_succs.size() != 2 || block.instructions.size() < 2)
168       return;
169 
170    Instruction* branch = block.instructions.back().get();
171    if (branch->opcode != aco_opcode::s_cbranch_scc0)
172       return;
173 
174    Block& merge = ctx.program->blocks[block.linear_succs[0]];
175    Block& loopexit = ctx.program->blocks[block.linear_succs[1]];
176 
177    /* Just a jump to the loop header. */
178    if (merge.linear_succs.size() != 1)
179       return;
180 
181    for (unsigned merge_pred : merge.linear_preds) {
182       if (merge_pred == block.index)
183          continue;
184 
185       Block& pred = ctx.program->blocks[merge_pred];
186       Instruction* pred_branch = pred.instructions.back().get();
187       /* The branch needs to be exec zero only, otherwise we corrupt exec. */
188       if (pred_branch->opcode != aco_opcode::p_cbranch_z ||
189           pred_branch->operands[0].physReg() != exec)
190          return;
191    }
192 
193    /* merge block: copy to exec, branch */
194    if (merge.instructions.size() != 2 || merge.instructions.back()->opcode != aco_opcode::s_branch)
195       return;
196 
197    Builder bld(ctx.program);
198    Instruction* execwrite = merge.instructions[0].get();
199    if (execwrite->opcode != bld.w64or32(Builder::s_mov) || !execwrite->writes_exec())
200       return;
201 
202    Instruction* execsrc = block.instructions[block.instructions.size() - 2].get();
203    if (execsrc->opcode != bld.w64or32(Builder::s_andn2) ||
204        execsrc->definitions[0].physReg() != execwrite->operands[0].physReg() ||
205        execsrc->operands[0].physReg() != execwrite->operands[0].physReg() ||
206        execsrc->operands[1].physReg() != exec)
207       return;
208 
209    /* Use conditional branch in merge block. */
210    merge.instructions.back()->opcode = aco_opcode::s_cbranch_scc1;
211    block.linear_succs.pop_back();
212    block.linear_succs[0] = merge.index;
213    merge.linear_succs.push_back(loopexit.index);
214    std::swap(merge.linear_succs[0], merge.linear_succs[1]);
215    std::replace(loopexit.linear_preds.begin(), loopexit.linear_preds.end(), block.index,
216                 merge.index);
217 
218    /* Check if we can use the loopexit as the fallthrough block.
219     * Otherwise, we'll need an extra branch instruction.
220     */
221    for (unsigned i = merge.index + 1; i < loopexit.index; i++) {
222       if (!ctx.program->blocks[i].instructions.empty()) {
223          branch->opcode = aco_opcode::s_branch;
224          merge.instructions.emplace_back(std::move(block.instructions.back()));
225          break;
226       }
227    }
228    block.instructions.pop_back();
229 
230    if (ctx.program->gfx_level >= GFX9) {
231       /* Combine s_andn2 and copy to exec to s_andn2_wrexec. */
232       Instruction* wr_exec =
233          bld.sop1(Builder::s_andn2_wrexec, execsrc->definitions[0], execsrc->definitions[1],
234                   Definition(exec, bld.lm), execsrc->operands[0], execsrc->operands[1]);
235       merge.instructions[0].reset(wr_exec);
236    } else {
237       /* Move s_andn2 to the merge block. */
238       merge.instructions.emplace(merge.instructions.begin(), std::move(block.instructions.back()));
239    }
240    block.instructions.pop_back();
241    ctx.blocks_incoming_exec_used[merge.index] = true;
242 }
243 
244 void
eliminate_useless_exec_writes_in_block(branch_ctx & ctx,Block & block)245 eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block)
246 {
247    /* Check if any successor needs the outgoing exec mask from the current block. */
248    bool exec_write_used;
249    if (block.kind & block_kind_end_with_regs) {
250       /* Last block of a program with succeed shader part should respect final exec write. */
251       exec_write_used = true;
252    } else if (block.linear_succs.empty() && !block.instructions.empty() &&
253               block.instructions.back()->opcode == aco_opcode::s_setpc_b64) {
254       /* This block ends in a long jump and exec might be needed for the next shader part. */
255       exec_write_used = true;
256    } else {
257       /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
258       exec_write_used =
259          std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
260                      [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
261    }
262 
263    /* Go through all instructions and eliminate useless exec writes. */
264    for (int i = block.instructions.size() - 1; i >= 0; --i) {
265       aco_ptr<Instruction>& instr = block.instructions[i];
266 
267       /* See if the current instruction needs or writes exec. */
268       bool needs_exec = needs_exec_mask(instr.get());
269       bool writes_exec =
270          instr->writes_exec() && instr->definitions[0].regClass() == ctx.program->lane_mask;
271 
272       /* See if we found an unused exec write. */
273       if (writes_exec && !exec_write_used) {
274          /* Don't eliminate an instruction that writes registers other than exec and scc.
275           * It is possible that this is eg. an s_and_saveexec and the saved value is
276           * used by a later branch.
277           */
278          bool writes_other = std::any_of(instr->definitions.begin(), instr->definitions.end(),
279                                          [](const Definition& def) -> bool
280                                          { return def.physReg() != exec && def.physReg() != scc; });
281          if (!writes_other) {
282             instr.reset();
283             continue;
284          }
285       }
286 
287       /* For a newly encountered exec write, clear the used flag. */
288       if (writes_exec)
289          exec_write_used = false;
290 
291       /* If the current instruction needs exec, mark it as used. */
292       exec_write_used |= needs_exec;
293    }
294 
295    /* Remember if the current block needs an incoming exec mask from its predecessors. */
296    ctx.blocks_incoming_exec_used[block.index] = exec_write_used;
297 
298    /* Cleanup: remove deleted instructions from the vector. */
299    auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
300    block.instructions.resize(new_end - block.instructions.begin());
301 }
302 
303 /**
304  *  Check if the branch instruction can be removed:
305  *  This is beneficial when executing the next block with an empty exec mask
306  *  is faster than the branch instruction itself.
307  *
308  *  Override this judgement when:
309  *  - The application prefers to remove control flow
310  *  - The compiler stack knows that it's a divergent branch never taken
311  */
312 bool
can_remove_branch(branch_ctx & ctx,Block & block,Pseudo_branch_instruction * branch)313 can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch)
314 {
315    const uint32_t target = branch->target[0];
316    const bool uniform_branch =
317       !((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) &&
318         branch->operands[0].physReg() == exec);
319 
320    if (branch->never_taken) {
321       assert(!uniform_branch || std::all_of(std::next(ctx.program->blocks.begin(), block.index + 1),
322                                             std::next(ctx.program->blocks.begin(), target),
323                                             [](Block& b) { return b.instructions.empty(); }));
324       return true;
325    }
326 
327    /* Cannot remove back-edges. */
328    if (block.index >= target)
329       return false;
330 
331    const bool prefer_remove = branch->rarely_taken;
332    unsigned num_scalar = 0;
333    unsigned num_vector = 0;
334 
335    /* Check the instructions between branch and target */
336    for (unsigned i = block.index + 1; i < target; i++) {
337       /* Uniform conditional branches must not be ignored if they
338        * are about to jump over actual instructions */
339       if (uniform_branch && !ctx.program->blocks[i].instructions.empty())
340          return false;
341 
342       for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) {
343          if (instr->isSOPP()) {
344             /* Discard early exits and loop breaks and continues should work fine with
345              * an empty exec mask.
346              */
347             if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
348                 instr->opcode == aco_opcode::s_cbranch_scc1 ||
349                 instr->opcode == aco_opcode::s_cbranch_execz ||
350                 instr->opcode == aco_opcode::s_cbranch_execnz) {
351                bool is_break_continue =
352                   ctx.program->blocks[i].kind & (block_kind_break | block_kind_continue);
353                bool discard_early_exit =
354                   ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit;
355                if (is_break_continue || discard_early_exit)
356                   continue;
357             }
358             return false;
359          } else if (instr->isSALU()) {
360             num_scalar++;
361          } else if (instr->isVALU() || instr->isVINTRP()) {
362             if (instr->opcode == aco_opcode::v_writelane_b32 ||
363                 instr->opcode == aco_opcode::v_writelane_b32_e64) {
364                /* writelane ignores exec, writing inactive lanes results in UB. */
365                return false;
366             }
367             num_vector++;
368             /* VALU which writes SGPRs are always executed on GFX10+ */
369             if (ctx.program->gfx_level >= GFX10) {
370                for (Definition& def : instr->definitions) {
371                   if (def.regClass().type() == RegType::sgpr)
372                      num_scalar++;
373                }
374             }
375          } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) {
376             /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
377              * SMEM might be an invalid access, and barriers are probably expensive. */
378             return false;
379          } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) {
380             // TODO: GFX6-9 can use vskip
381             if (!prefer_remove)
382                return false;
383          } else if (instr->opcode != aco_opcode::p_debug_info) {
384             assert(false && "Pseudo instructions should be lowered by this point.");
385             return false;
386          }
387 
388          if (!prefer_remove) {
389             /* Under these conditions, we shouldn't remove the branch.
390              * Don't care about the estimated cycles when the shader prefers flattening.
391              */
392             unsigned est_cycles;
393             if (ctx.program->gfx_level >= GFX10)
394                est_cycles = num_scalar * 2 + num_vector;
395             else
396                est_cycles = num_scalar * 4 + num_vector * 4;
397 
398             if (est_cycles > 16)
399                return false;
400          }
401       }
402    }
403 
404    return true;
405 }
406 
407 void
lower_branch_instruction(branch_ctx & ctx,Block & block)408 lower_branch_instruction(branch_ctx& ctx, Block& block)
409 {
410    if (block.instructions.empty() || !block.instructions.back()->isBranch())
411       return;
412 
413    aco_ptr<Instruction> branch = std::move(block.instructions.back());
414    const uint32_t target = branch->branch().target[0];
415    block.instructions.pop_back();
416 
417    if (can_remove_branch(ctx, block, &branch->branch())) {
418       if (branch->opcode != aco_opcode::p_branch)
419          remove_linear_successor(ctx, block, target);
420       return;
421    }
422 
423    /* emit branch instruction */
424    Builder bld(ctx.program, &block.instructions);
425    switch (branch->opcode) {
426    case aco_opcode::p_branch:
427       assert(block.linear_succs[0] == target);
428       bld.sopp(aco_opcode::s_branch, target);
429       break;
430    case aco_opcode::p_cbranch_nz:
431       assert(block.linear_succs[1] == target);
432       if (branch->operands[0].physReg() == exec)
433          bld.sopp(aco_opcode::s_cbranch_execnz, target);
434       else if (branch->operands[0].physReg() == vcc)
435          bld.sopp(aco_opcode::s_cbranch_vccnz, target);
436       else {
437          assert(branch->operands[0].physReg() == scc);
438          bld.sopp(aco_opcode::s_cbranch_scc1, target);
439       }
440       break;
441    case aco_opcode::p_cbranch_z:
442       assert(block.linear_succs[1] == target);
443       if (branch->operands[0].physReg() == exec)
444          bld.sopp(aco_opcode::s_cbranch_execz, target);
445       else if (branch->operands[0].physReg() == vcc)
446          bld.sopp(aco_opcode::s_cbranch_vccz, target);
447       else {
448          assert(branch->operands[0].physReg() == scc);
449          bld.sopp(aco_opcode::s_cbranch_scc0, target);
450       }
451       break;
452    default: unreachable("Unknown Pseudo branch instruction!");
453    }
454 }
455 
456 void
try_stitch_linear_block(branch_ctx & ctx,Block & block)457 try_stitch_linear_block(branch_ctx& ctx, Block& block)
458 {
459    /* Don't stitch blocks that are part of the logical CFG. */
460    if (block.linear_preds.empty() || block.linear_succs.empty() || !block.logical_preds.empty())
461       return;
462 
463    /* Try to stitch this block with the predecessor:
464     * This block must have exactly one predecessor and
465     * the predecessor must have exactly one successor.
466     */
467    Block& pred = ctx.program->blocks[block.linear_preds[0]];
468    if (block.linear_preds.size() == 1 && pred.linear_succs.size() == 1 &&
469        (pred.instructions.empty() || !pred.instructions.back()->isSOPP())) {
470       /* Insert the instructions at the end of the predecessor and fixup edges. */
471       pred.instructions.insert(pred.instructions.end(),
472                                std::move_iterator(block.instructions.begin()),
473                                std::move_iterator(block.instructions.end()));
474       for (unsigned succ_idx : block.linear_succs) {
475          Block& s = ctx.program->blocks[succ_idx];
476          std::replace(s.linear_preds.begin(), s.linear_preds.end(), block.index, pred.index);
477       }
478       pred.linear_succs = std::move(block.linear_succs);
479 
480       block.instructions.clear();
481       block.linear_preds.clear();
482       block.linear_succs.clear();
483       return;
484    }
485 
486    /* Try to stitch this block with the successor:
487     * This block must have exactly one successor and
488     * the successor must have exactly one predecessor.
489     */
490    Block& succ = ctx.program->blocks[block.linear_succs[0]];
491    if (block.linear_succs.size() == 1 && succ.linear_preds.size() == 1 &&
492        (block.instructions.empty() || !block.instructions.back()->isSOPP())) {
493       /* Insert the instructions at the beginning of the successor. */
494       succ.instructions.insert(succ.instructions.begin(),
495                                std::move_iterator(block.instructions.begin()),
496                                std::move_iterator(block.instructions.end()));
497       for (unsigned pred_idx : block.linear_preds) {
498          Block& p = ctx.program->blocks[pred_idx];
499          if (!p.instructions.empty() &&
500              instr_info.classes[(int)p.instructions.back()->opcode] == instr_class::branch &&
501              p.instructions.back()->salu().imm == block.index) {
502             p.instructions.back()->salu().imm = succ.index;
503          }
504          std::replace(p.linear_succs.begin(), p.linear_succs.end(), block.index, succ.index);
505       }
506       succ.linear_preds = std::move(block.linear_preds);
507 
508       block.instructions.clear();
509       block.linear_preds.clear();
510       block.linear_succs.clear();
511    }
512 }
513 
} /* end namespace */

516 void
lower_branches(Program * program)517 lower_branches(Program* program)
518 {
519    branch_ctx ctx(program);
520 
521    for (int i = program->blocks.size() - 1; i >= 0; i--) {
522       Block& block = program->blocks[i];
523       lower_branch_instruction(ctx, block);
524       eliminate_useless_exec_writes_in_block(ctx, block);
525 
526       if (block.kind & block_kind_break)
527          try_merge_break_with_continue(ctx, block);
528 
529       if (block.linear_succs.size() == 1)
530          try_remove_simple_block(ctx, block);
531    }
532 
533    for (Block& block : program->blocks) {
534       try_stitch_linear_block(ctx, block);
535    }
536 }
537 
} // namespace aco