/*
 * Copyright © 2019 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include <vector>

namespace aco {

namespace {

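/* This pass lowers the exec-mask pseudo-instructions and inserts the code
 * that keeps the exec register correct across WQM/Exact transitions,
 * divergent control flow, loops, discards and demotes. For each linear
 * block it tracks a stack of exec masks (block_info::exec): entry 0 is the
 * outermost (global) mask and the last entry mirrors the current contents
 * of exec.
 */
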
enum WQMState : uint8_t {
   Unspecified = 0,
   Exact,
   WQM, /* with control flow applied */
};

enum mask_type : uint8_t {
   mask_type_global = 1 << 0,
   mask_type_exact = 1 << 1,
   mask_type_wqm = 1 << 2,
   mask_type_loop = 1 << 3, /* active lanes of a loop */
};

struct loop_info {
   Block* loop_header;
   uint16_t num_exec_masks;
   bool has_divergent_break;
   bool has_divergent_continue;
   bool has_discard; /* has a discard or demote */
   loop_info(Block* b, uint16_t num, bool breaks, bool cont, bool discard)
       : loop_header(b), num_exec_masks(num), has_divergent_break(breaks),
         has_divergent_continue(cont), has_discard(discard)
   {}
};

struct exec_info {
   Operand op; /* Either a temporary, exec or const -1. */
   uint8_t type; /* enum mask_type */
   exec_info() = default;
   exec_info(const Operand& op_, const uint8_t& type_) : op(op_), type(type_) {}
};

struct block_info {
   std::vector<exec_info> exec;
};

struct exec_ctx {
   Program* program;
   std::vector<block_info> info;
   std::vector<loop_info> loop;
   bool handle_wqm = false;
   exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {}
};

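/* Returns whether an instruction must be executed with the Exact mask:
 * memory instructions with disable_wqm set, exports (including
 * p_dual_src_export_gfx11) and p_jump_to_epilog.
 */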
bool
needs_exact(aco_ptr<Instruction>& instr)
{
   if (instr->isMUBUF()) {
      return instr->mubuf().disable_wqm;
   } else if (instr->isMTBUF()) {
      return instr->mtbuf().disable_wqm;
   } else if (instr->isMIMG()) {
      return instr->mimg().disable_wqm;
   } else if (instr->isFlatLike()) {
      return instr->flatlike().disable_wqm;
   } else {
      /* Require Exact for p_jump_to_epilog because if p_exit_early_if_not is
       * emitted inside the same block, the main FS will always jump to the PS
       * epilog without considering the exec mask.
       */
      return instr->isEXP() || instr->opcode == aco_opcode::p_jump_to_epilog ||
             instr->opcode == aco_opcode::p_dual_src_export_gfx11;
   }
}

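/* Classifies what an instruction needs: Exact, WQM for everything that is
 * predicated by exec (including branches and p_logical_end), or Unspecified
 * if either mask works.
 */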
WQMState
get_instr_needs(aco_ptr<Instruction>& instr)
{
   if (needs_exact(instr))
      return Exact;

   bool pred_by_exec = needs_exec_mask(instr.get()) || instr->opcode == aco_opcode::p_logical_end ||
                       instr->isBranch();

   return pred_by_exec ? WQM : Unspecified;
}

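/* Transitions the current exec mask of block `idx` to WQM. If the top of
 * the exec stack is a global mask, a new WQM mask is computed with s_wqm
 * and pushed; otherwise the WQM mask one entry below the top is restored
 * into exec.
 */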
void
transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
{
   if (ctx.info[idx].exec.back().type & mask_type_wqm)
      return;
   if (ctx.info[idx].exec.back().type & mask_type_global) {
      Operand exec_mask = ctx.info[idx].exec.back().op;
      if (exec_mask == Operand(exec, bld.lm))
         ctx.info[idx].exec.back().op = bld.copy(bld.def(bld.lm), exec_mask);

      bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), exec_mask);
      ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask_type_global | mask_type_wqm);
      return;
   }
   /* otherwise, the WQM mask should be one below the current mask */
   ctx.info[idx].exec.pop_back();
   assert(ctx.info[idx].exec.back().type & mask_type_wqm);
   assert(ctx.info[idx].exec.back().op.size() == bld.lm.size());
   assert(ctx.info[idx].exec.back().op.isTemp());
   bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().op);
}

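/* Transitions the current exec mask of block `idx` to Exact. Restores the
 * outer global Exact mask when the top of the stack can be dropped;
 * otherwise ANDs the global Exact mask (exec stack entry 0) into exec and
 * pushes the result.
 */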
void
transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
{
   if (ctx.info[idx].exec.back().type & mask_type_exact)
      return;
   /* We can't remove the loop exec mask, because that can cause exec.size() to
    * be less than num_exec_masks. The loop exec mask also needs to be kept
    * around for various uses. */
   if ((ctx.info[idx].exec.back().type & mask_type_global) &&
       !(ctx.info[idx].exec.back().type & mask_type_loop)) {
      ctx.info[idx].exec.pop_back();
      assert(ctx.info[idx].exec.back().type & mask_type_exact);
      assert(ctx.info[idx].exec.back().op.size() == bld.lm.size());
      assert(ctx.info[idx].exec.back().op.isTemp());
      bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().op);
      return;
   }
   /* otherwise, we create an exact mask and push to the stack */
   Operand wqm = ctx.info[idx].exec.back().op;
   if (wqm == Operand(exec, bld.lm)) {
      wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                     Definition(exec, bld.lm), ctx.info[idx].exec[0].op, Operand(exec, bld.lm));
   } else {
      bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), ctx.info[idx].exec[0].op,
               wqm);
   }
   ctx.info[idx].exec.back().op = Operand(wqm);
   ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask_type_exact);
}

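/* Emits the per-block prologue: seeds the exec stack in the start block,
 * creates linear phis for the exec masks at loop headers and merge points,
 * and restores exec after divergent control flow. Returns the index of the
 * first instruction of the block that still needs processing.
 */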
unsigned
add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions)
{
   unsigned idx = block->index;
   Builder bld(ctx.program, &instructions);
   Block::edge_vec& preds = block->linear_preds;
   bool restore_exec = false;

   /* start block */
   if (preds.empty()) {
      aco_ptr<Instruction>& startpgm = block->instructions[0];
      assert(startpgm->opcode == aco_opcode::p_startpgm);
      bld.insert(std::move(startpgm));

      unsigned count = 1;
      while (block->instructions[count]->opcode == aco_opcode::p_init_scratch ||
             block->instructions[count]->opcode == aco_opcode::s_setprio) {
         bld.insert(std::move(block->instructions[count]));
         count++;
      }

      Operand start_exec(exec, bld.lm);

      /* exec seems to need to be manually initialized with combined shaders */
      if (ctx.program->stage.num_sw_stages() > 1 ||
          ctx.program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
          (ctx.program->stage.sw == SWStage::VS &&
           (ctx.program->stage.hw == AC_HW_HULL_SHADER ||
            ctx.program->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER)) ||
          (ctx.program->stage.sw == SWStage::TES &&
           ctx.program->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER)) {
         start_exec = Operand::c32_or_c64(-1u, bld.lm == s2);
         bld.copy(Definition(exec, bld.lm), start_exec);
      }

      /* EXEC is automatically initialized by the HW for compute shaders.
       * We know for sure exec is initially -1 when the shader always has full subgroups.
       */
      if (ctx.program->stage == compute_cs && ctx.program->info.cs.uses_full_subgroups)
         start_exec = Operand::c32_or_c64(-1u, bld.lm == s2);

      if (ctx.handle_wqm) {
         ctx.info[idx].exec.emplace_back(start_exec, mask_type_global | mask_type_exact);
         /* Initialize WQM already */
         transition_to_WQM(ctx, bld, idx);
      } else {
         uint8_t mask = mask_type_global;
         if (ctx.program->needs_wqm) {
            bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
                     Operand(exec, bld.lm));
            mask |= mask_type_wqm;
         } else {
            mask |= mask_type_exact;
         }
         ctx.info[idx].exec.emplace_back(start_exec, mask);
      }

      return count;
   }

   /* loop entry block */
   if (block->kind & block_kind_loop_header) {
      assert(preds[0] == idx - 1);
      ctx.info[idx].exec = ctx.info[idx - 1].exec;
      loop_info& info = ctx.loop.back();
      assert(ctx.info[idx].exec.size() == info.num_exec_masks);

      /* create ssa names for outer exec masks */
      if (info.has_discard && preds.size() > 1) {
         aco_ptr<Instruction> phi;
         for (int i = 0; i < info.num_exec_masks - 1; i++) {
            phi.reset(
               create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
            phi->definitions[0] = bld.def(bld.lm);
            phi->operands[0] = ctx.info[preds[0]].exec[i].op;
            ctx.info[idx].exec[i].op = bld.insert(std::move(phi));
         }
      }

      ctx.info[idx].exec.back().type |= mask_type_loop;

      if (info.has_divergent_continue) {
         /* create ssa name for loop active mask */
         aco_ptr<Instruction> phi{
            create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
         phi->definitions[0] = bld.def(bld.lm);
         phi->operands[0] = ctx.info[preds[0]].exec.back().op;
         ctx.info[idx].exec.back().op = bld.insert(std::move(phi));

         restore_exec = true;
         uint8_t mask_type = ctx.info[idx].exec.back().type & (mask_type_wqm | mask_type_exact);
         ctx.info[idx].exec.emplace_back(ctx.info[idx].exec.back().op, mask_type);
      }

   } else if (block->kind & block_kind_loop_exit) {
      Block* header = ctx.loop.back().loop_header;
      loop_info& info = ctx.loop.back();

      for (ASSERTED unsigned pred : preds)
         assert(ctx.info[pred].exec.size() >= info.num_exec_masks);

      /* fill the loop header phis */
      Block::edge_vec& header_preds = header->linear_preds;
      int instr_idx = 0;
      if (info.has_discard && header_preds.size() > 1) {
         while (instr_idx < info.num_exec_masks - 1) {
            aco_ptr<Instruction>& phi = header->instructions[instr_idx];
            assert(phi->opcode == aco_opcode::p_linear_phi);
            for (unsigned i = 1; i < phi->operands.size(); i++)
               phi->operands[i] = ctx.info[header_preds[i]].exec[instr_idx].op;
            instr_idx++;
         }
      }

      if (info.has_divergent_continue) {
         aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
         assert(phi->opcode == aco_opcode::p_linear_phi);
         for (unsigned i = 1; i < phi->operands.size(); i++)
            phi->operands[i] = ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].op;
         restore_exec = true;
      }

      if (info.has_divergent_break) {
         restore_exec = true;
         /* Drop the loop active mask. */
         info.num_exec_masks--;
      }
      assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);

      /* create the loop exit phis if not trivial */
      for (unsigned exec_idx = 0; exec_idx < info.num_exec_masks; exec_idx++) {
         Operand same = ctx.info[preds[0]].exec[exec_idx].op;
         uint8_t type = ctx.info[header_preds[0]].exec[exec_idx].type;
         bool trivial = true;

         for (unsigned i = 1; i < preds.size() && trivial; i++) {
            if (ctx.info[preds[i]].exec[exec_idx].op != same)
               trivial = false;
         }

         if (trivial) {
            ctx.info[idx].exec.emplace_back(same, type);
         } else {
            /* create phi for loop footer */
            aco_ptr<Instruction> phi{
               create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
            phi->definitions[0] = bld.def(bld.lm);
            for (unsigned i = 0; i < phi->operands.size(); i++)
               phi->operands[i] = ctx.info[preds[i]].exec[exec_idx].op;
            ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
         }
      }

      assert(ctx.info[idx].exec.size() == info.num_exec_masks);
      ctx.loop.pop_back();

   } else if (preds.size() == 1) {
      ctx.info[idx].exec = ctx.info[preds[0]].exec;

      /* After continue and break blocks, we implicitly set exec to zero.
       * This is so that parallelcopies can be inserted before the branch
       * without being affected by the changed exec mask.
       */
      if (ctx.info[idx].exec.back().op.constantEquals(0)) {
         assert(block->logical_succs.empty());
         /* Check whether the successor block already restores exec. */
         uint16_t block_kind = ctx.program->blocks[block->linear_succs[0]].kind;
         if (!(block_kind & (block_kind_loop_header | block_kind_loop_exit | block_kind_invert |
                             block_kind_merge))) {
            /* The successor does not restore exec. */
            restore_exec = true;
         }
      }
   } else {
      assert(preds.size() == 2);
      assert(ctx.info[preds[0]].exec.size() == ctx.info[preds[1]].exec.size());

      unsigned last = ctx.info[preds[0]].exec.size() - 1;

      /* create phis for diverged temporary exec masks */
      for (unsigned i = 0; i < last; i++) {
         /* skip trivial phis */
         if (ctx.info[preds[0]].exec[i].op == ctx.info[preds[1]].exec[i].op) {
            Operand op = ctx.info[preds[0]].exec[i].op;
            /* discard/demote can change the state of the current exec mask */
            assert(!op.isTemp() ||
                   ctx.info[preds[0]].exec[i].type == ctx.info[preds[1]].exec[i].type);
            uint8_t mask = ctx.info[preds[0]].exec[i].type & ctx.info[preds[1]].exec[i].type;
            ctx.info[idx].exec.emplace_back(op, mask);
            continue;
         }

         Operand phi = bld.pseudo(aco_opcode::p_linear_phi, bld.def(bld.lm),
                                  ctx.info[preds[0]].exec[i].op, ctx.info[preds[1]].exec[i].op);
         uint8_t mask_type = ctx.info[preds[0]].exec[i].type & ctx.info[preds[1]].exec[i].type;
         ctx.info[idx].exec.emplace_back(phi, mask_type);
      }

      if (block->kind & block_kind_merge) {
         restore_exec = true;
      } else {
         /* The last mask is already in exec. */
         Operand current_exec = Operand(exec, bld.lm);
         if (ctx.info[preds[0]].exec[last].op == ctx.info[preds[1]].exec[last].op) {
            current_exec = ctx.info[preds[0]].exec[last].op;
         }
         uint8_t mask_type =
            ctx.info[preds[0]].exec[last].type & ctx.info[preds[1]].exec[last].type;
         ctx.info[idx].exec.emplace_back(current_exec, mask_type);
      }
   }

   unsigned i = 0;
   while (block->instructions[i]->opcode == aco_opcode::p_phi ||
          block->instructions[i]->opcode == aco_opcode::p_linear_phi) {
      bld.insert(std::move(block->instructions[i]));
      i++;
   }

   if (ctx.handle_wqm) {
      /* End WQM handling if not needed anymore */
      if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
         if (block->instructions[i]->opcode == aco_opcode::p_end_wqm) {
            ctx.info[idx].exec.back().type |= mask_type_global;
            transition_to_Exact(ctx, bld, idx);
            ctx.handle_wqm = false;
            restore_exec = false;
            i++;
         }
      }
   }

   /* restore exec mask after divergent control flow */
   if (restore_exec) {
      Operand restore = ctx.info[idx].exec.back().op;
      assert(restore.size() == bld.lm.size());
      bld.copy(Definition(exec, bld.lm), restore);
   }

   return i;
}

/* Avoid live-range splits in Exact mode:
 * Because the data register of atomic VMEM instructions
 * is shared between src and dst, it might be necessary
 * to create live-range splits during RA.
 * Make the live-range splits explicit in WQM mode.
 */
void
handle_atomic_data(exec_ctx& ctx, Builder& bld, unsigned block_idx, aco_ptr<Instruction>& instr)
{
   /* check if this is an atomic VMEM instruction */
   int idx = -1;
   if (!instr->isVMEM() || instr->definitions.empty())
      return;
   else if (instr->isMIMG())
      idx = instr->operands[2].isTemp() ? 2 : -1;
   else if (instr->operands.size() == 4)
      idx = 3;

   if (idx != -1) {
      /* insert explicit copy of atomic data in WQM-mode */
      transition_to_WQM(ctx, bld, block_idx);
      Temp data = instr->operands[idx].getTemp();
      data = bld.copy(bld.def(data.regClass()), data);
      instr->operands[idx].setTemp(data);
   }
}

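/* Lowers the instructions of a block: inserts WQM/Exact transitions where
 * get_instr_needs() requires them and lowers the exec-related
 * pseudo-instructions p_discard_if, p_is_helper, p_demote_to_helper,
 * p_elect and p_end_wqm.
 */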
void
process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions,
                     unsigned idx)
{
   block_info& info = ctx.info[block->index];
   WQMState state;
   if (info.exec.back().type & mask_type_wqm) {
      state = WQM;
   } else {
      assert(!ctx.handle_wqm || info.exec.back().type & mask_type_exact);
      state = Exact;
   }

   Builder bld(ctx.program, &instructions);

   for (; idx < block->instructions.size(); idx++) {
      aco_ptr<Instruction> instr = std::move(block->instructions[idx]);

      WQMState needs = ctx.handle_wqm ? get_instr_needs(instr) : Unspecified;

      if (needs == WQM && state != WQM) {
         transition_to_WQM(ctx, bld, block->index);
         state = WQM;
      } else if (needs == Exact) {
         if (ctx.handle_wqm)
            handle_atomic_data(ctx, bld, block->index, instr);
         transition_to_Exact(ctx, bld, block->index);
         state = Exact;
      }

      if (instr->opcode == aco_opcode::p_discard_if) {
         Operand current_exec = Operand(exec, bld.lm);

         if (block->instructions[idx + 1]->opcode == aco_opcode::p_end_wqm) {
            /* Transition to Exact without extra instruction. */
            info.exec.resize(1);
            assert(info.exec[0].type == (mask_type_exact | mask_type_global));
            current_exec = info.exec[0].op;
            info.exec[0].op = Operand(exec, bld.lm);
            state = Exact;
         } else if (info.exec.size() >= 2 && ctx.handle_wqm) {
            /* Preserve the WQM mask */
            info.exec[1].type &= ~mask_type_global;
         }

         Temp cond;
         if (instr->operands[0].isConstant()) {
            assert(instr->operands[0].constantValue() == -1u);
            /* save condition and set exec to zero */
            cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                            Definition(exec, bld.lm), Operand::zero(), Operand(exec, bld.lm));
         } else {
            cond = instr->operands[0].getTemp();
            /* discard from current exec */
            bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), current_exec,
                     cond);
         }

         if (info.exec.size() == 1) {
            instr->operands[0] = Operand(exec, bld.lm);
         } else {
            /* discard from inner to outer exec mask on stack */
            int num = info.exec.size() - 2;
            Temp exit_cond;
            for (int i = num; i >= 0; i--) {
               Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                             info.exec[i].op, cond);
               info.exec[i].op = Operand(andn2->definitions[0].getTemp());
               exit_cond = andn2->definitions[1].getTemp();
            }
            instr->operands[0] = bld.scc(exit_cond);
         }

         info.exec.back().op = Operand(exec, bld.lm);
         instr->opcode = aco_opcode::p_exit_early_if_not;
         assert(!ctx.handle_wqm || (info.exec[0].type & mask_type_wqm) == 0);
      } else if (instr->opcode == aco_opcode::p_is_helper) {
         Definition dst = instr->definitions[0];
         assert(dst.size() == bld.lm.size());
         if (state == Exact) {
            instr.reset(create_instruction(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
            instr->operands[0] = Operand::zero();
            instr->definitions[0] = dst;
         } else {
            exec_info& exact_mask = info.exec[0];
            assert(exact_mask.type & mask_type_exact);

            instr.reset(create_instruction(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
            instr->operands[0] = Operand(exec, bld.lm); /* current exec */
            instr->operands[1] = Operand(exact_mask.op);
            instr->definitions[0] = dst;
            instr->definitions[1] = bld.def(s1, scc);
         }
      } else if (instr->opcode == aco_opcode::p_demote_to_helper) {
         assert((info.exec[0].type & mask_type_exact) && (info.exec[0].type & mask_type_global));

         const bool nested_cf = !(info.exec.back().type & mask_type_global);
         if (ctx.handle_wqm && state == Exact && nested_cf) {
            /* Transition back to WQM without extra instruction. */
            info.exec.pop_back();
            state = WQM;
         } else if (block->instructions[idx + 1]->opcode == aco_opcode::p_end_wqm) {
            /* Transition to Exact without extra instruction. */
            info.exec.resize(1);
            state = Exact;
         } else if (nested_cf) {
            /* Save current exec temporarily. */
            info.exec.back().op = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
         } else {
            info.exec.back().op = Operand(exec, bld.lm);
         }

         /* Remove invocations from global exact mask. */
         Definition def = state == Exact ? Definition(exec, bld.lm) : bld.def(bld.lm);
         Operand src = instr->operands[0].isConstant() ? Operand(exec, bld.lm) : instr->operands[0];

         bld.sop2(Builder::s_andn2, def, bld.def(s1, scc), info.exec[0].op, src);
         info.exec[0].op = def.isTemp() ? Operand(def.getTemp()) : Operand(exec, bld.lm);

         /* Update global WQM mask and store in exec. */
         if (state == WQM) {
            assert(info.exec.size() > 1);
            bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), def.getTemp());
         }

         /* End shader if global mask is zero. */
         instr->opcode = aco_opcode::p_exit_early_if_not;
         instr->operands[0] = Operand(exec, bld.lm);
         bld.insert(std::move(instr));

         /* Update all other exec masks. */
         if (nested_cf) {
            const unsigned global_idx = state == WQM ? 1 : 0;
            for (unsigned i = global_idx + 1; i < info.exec.size() - 1; i++) {
               info.exec[i].op = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
                                          info.exec[i].op, Operand(exec, bld.lm));
            }
            /* Update current exec and save WQM mask. */
            info.exec[global_idx].op =
               bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                        Definition(exec, bld.lm), info.exec.back().op, Operand(exec, bld.lm));
            info.exec.back().op = Operand(exec, bld.lm);
         }
         continue;

      } else if (instr->opcode == aco_opcode::p_elect) {
         bool all_lanes_enabled = info.exec.back().op.constantEquals(-1u);
         Definition dst = instr->definitions[0];

         if (all_lanes_enabled) {
            bld.copy(Definition(dst), Operand::c32_or_c64(1u, dst.size() == 2));
         } else {
            Temp first_lane_idx = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
            bld.sop2(Builder::s_lshl, Definition(dst), bld.def(s1, scc),
                     Operand::c32_or_c64(1u, dst.size() == 2), Operand(first_lane_idx));
         }
         continue;
      } else if (instr->opcode == aco_opcode::p_end_wqm) {
         assert(block->kind & block_kind_top_level);
         assert(info.exec.size() <= 2);
         /* This instruction indicates the end of WQM mode. */
         info.exec.back().type |= mask_type_global;
         transition_to_Exact(ctx, bld, block->index);
         state = Exact;
         ctx.handle_wqm = false;
         continue;
      }

      bld.insert(std::move(instr));
   }
}

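/* Lowers the branch at the end of the block according to the block kind.
 * For a divergent if, the emitted pattern is roughly (illustrative sketch,
 * wave64 opcodes; wave32 uses the _b32 variants):
 *
 *    old_exec = s_and_saveexec_b64 exec, cond   ; block_kind_branch
 *    ...                                        ; then side
 *    exec = s_andn2_b64 old_exec, exec          ; block_kind_invert
 *    ...                                        ; else side
 *                                               ; the merge block restores exec
 */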
void
add_branch_code(exec_ctx& ctx, Block* block)
{
   unsigned idx = block->index;
   Builder bld(ctx.program, block);

   if (block->linear_succs.empty())
      return;

   if (block->kind & block_kind_loop_preheader) {
      /* collect information about the succeeding loop */
      bool has_divergent_break = false;
      bool has_divergent_continue = false;
      bool has_discard = false;
      unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth;

      for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) {
         Block& loop_block = ctx.program->blocks[i];

         if (loop_block.kind & block_kind_uses_discard)
            has_discard = true;
         if (loop_block.loop_nest_depth != loop_nest_depth)
            continue;

         if (loop_block.kind & block_kind_uniform)
            continue;
         else if (loop_block.kind & block_kind_break)
            has_divergent_break = true;
         else if (loop_block.kind & block_kind_continue)
            has_divergent_continue = true;
      }

      if (has_divergent_break) {
         /* save restore exec mask */
         const Operand& current_exec = ctx.info[idx].exec.back().op;
         if (!current_exec.isTemp() && !current_exec.isConstant()) {
            bld.reset(bld.instructions, std::prev(bld.instructions->end()));
            Operand restore = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
            ctx.info[idx].exec.back().op = restore;
            bld.reset(bld.instructions);
         }
         uint8_t mask = ctx.info[idx].exec.back().type & (mask_type_wqm | mask_type_exact);
         ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask);
      }
      unsigned num_exec_masks = ctx.info[idx].exec.size();

      ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks,
                            has_divergent_break, has_divergent_continue, has_discard);

      Pseudo_branch_instruction& branch = block->instructions.back()->branch();
      branch.target[0] = block->linear_succs[0];
   } else if (block->kind & block_kind_continue_or_break) {
      assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind &
             block_kind_loop_header);
      assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind &
             block_kind_loop_exit);
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      while (!(ctx.info[idx].exec.back().type & mask_type_loop))
         ctx.info[idx].exec.pop_back();

      Temp cond = bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc),
                           ctx.info[idx].exec.back().op, Operand::zero(bld.lm.bytes()))
                     .def(1)
                     .getTemp();
      bld.branch(aco_opcode::p_cbranch_nz, Operand(cond, scc), block->linear_succs[1],
                 block->linear_succs[0]);
   } else if (block->kind & block_kind_uniform) {
      Pseudo_branch_instruction& branch = block->instructions.back()->branch();
      if (branch.opcode == aco_opcode::p_branch) {
         branch.target[0] = block->linear_succs[0];
      } else {
         branch.target[0] = block->linear_succs[1];
         branch.target[1] = block->linear_succs[0];
      }
   } else if (block->kind & block_kind_branch) {
      // orig = s_and_saveexec_b64
      assert(block->linear_succs.size() == 2);
      assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
      Temp cond = block->instructions.back()->operands[0].getTemp();
      aco_ptr<Instruction> branch = std::move(block->instructions.back());
      block->instructions.pop_back();

      uint8_t mask_type = ctx.info[idx].exec.back().type & (mask_type_wqm | mask_type_exact);
      if (ctx.info[idx].exec.back().op.constantEquals(-1u)) {
         bld.copy(Definition(exec, bld.lm), cond);
      } else if (ctx.info[idx].exec.back().op.isTemp()) {
         bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), cond,
                  Operand(exec, bld.lm));
      } else {
         Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                                  Definition(exec, bld.lm), cond, Operand(exec, bld.lm));

         ctx.info[idx].exec.back().op = Operand(old_exec);
      }

      /* add next current exec to the stack */
      ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask_type);

      Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, Operand(exec, bld.lm),
                                     block->linear_succs[1], block->linear_succs[0]);
      r->branch().rarely_taken = branch->branch().rarely_taken;
      r->branch().never_taken = branch->branch().never_taken;
   } else if (block->kind & block_kind_invert) {
      // exec = s_andn2_b64 (original_exec, exec)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      aco_ptr<Instruction> branch = std::move(block->instructions.back());
      block->instructions.pop_back();
      assert(ctx.info[idx].exec.size() >= 2);
      Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].op;
      bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
               Operand(exec, bld.lm));

      Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, Operand(exec, bld.lm),
                                     block->linear_succs[1], block->linear_succs[0]);
      r->branch().rarely_taken = branch->branch().rarely_taken;
      r->branch().never_taken = branch->branch().never_taken;
   } else if (block->kind & block_kind_break) {
      // loop_mask = s_andn2_b64 (loop_mask, exec)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      Temp cond = Temp();
      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
         cond = bld.tmp(s1);
         Operand exec_mask = ctx.info[idx].exec[exec_idx].op;
         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                              exec_mask, Operand(exec, bld.lm));
         ctx.info[idx].exec[exec_idx].op = exec_mask;
         if (ctx.info[idx].exec[exec_idx].type & mask_type_loop)
            break;
      }

      /* Implicitly set exec to zero and branch. */
      ctx.info[idx].exec.back().op = Operand::zero(bld.lm.bytes());
      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1],
                 block->linear_succs[0]);
   } else if (block->kind & block_kind_continue) {
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      Temp cond = Temp();
      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
         if (ctx.info[idx].exec[exec_idx].type & mask_type_loop)
            break;
         cond = bld.tmp(s1);
         Operand exec_mask = ctx.info[idx].exec[exec_idx].op;
         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                              exec_mask, Operand(exec, bld.lm));
         ctx.info[idx].exec[exec_idx].op = exec_mask;
      }
      assert(cond != Temp());

      /* Implicitly set exec to zero and branch. */
      ctx.info[idx].exec.back().op = Operand::zero(bld.lm.bytes());
      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1],
                 block->linear_succs[0]);
   } else {
      unreachable("unknown/invalid block type");
   }
}

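/* Processes a single block: add_coupling_code emits the prologue,
 * process_instructions lowers the block body, and add_branch_code rewrites
 * the final branch.
 */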
void
process_block(exec_ctx& ctx, Block* block)
{
   std::vector<aco_ptr<Instruction>> instructions;
   instructions.reserve(block->instructions.size());

   unsigned idx = add_coupling_code(ctx, block, instructions);

   assert(!block->linear_succs.empty() || ctx.info[block->index].exec.size() <= 2);

   process_instructions(ctx, block, instructions, idx);

   block->instructions = std::move(instructions);

   add_branch_code(ctx, block);
}

} /* end namespace */

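/* Entry point of the pass. WQM handling is only enabled when the program
 * needs both WQM and Exact execution; otherwise the start block simply sets
 * up one mask (with s_wqm if the whole shader needs WQM).
 */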
void
insert_exec_mask(Program* program)
{
   exec_ctx ctx(program);

   if (program->needs_wqm && program->needs_exact)
      ctx.handle_wqm = true;

   for (Block& block : program->blocks)
      process_block(ctx, &block);
}

} // namespace aco