/*
 * Copyright © 2019 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
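
/* This pass turns the WQM/Exact requirements of instructions and the
 * program's divergent control flow into explicit exec mask manipulation:
 * it maintains a stack of exec masks per block, inserts transitions
 * between WQM and Exact where needed, and lowers discard/demote and the
 * exec-related pseudo-instructions.
 */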

#include "aco_builder.h"
#include "aco_ir.h"

#include <vector>

namespace aco {

namespace {

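/* Execution mode required by an instruction: WQM (whole quad mode, with
 * helper invocations enabled) or Exact (only truly active lanes). */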
enum WQMState : uint8_t {
   Unspecified = 0,
   Exact,
   WQM, /* with control flow applied */
};

enum mask_type : uint8_t {
   mask_type_global = 1 << 0,
   mask_type_exact = 1 << 1,
   mask_type_wqm = 1 << 2,
   mask_type_loop = 1 << 3, /* active lanes of a loop */
};

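/* Information about a loop, collected at its preheader and used while
 * processing the loop's header and exit blocks. */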
struct loop_info {
   Block* loop_header;
   uint16_t num_exec_masks;
   bool has_divergent_break;
   bool has_divergent_continue;
   bool has_discard; /* has a discard or demote */
   loop_info(Block* b, uint16_t num, bool breaks, bool cont, bool discard)
       : loop_header(b), num_exec_masks(num), has_divergent_break(breaks),
         has_divergent_continue(cont), has_discard(discard)
   {}
};

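/* One entry of a block's exec mask stack. */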
struct exec_info {
   Operand op; /* Either a temporary, exec or const -1. */
   uint8_t type; /* enum mask_type */
   exec_info() = default;
   exec_info(const Operand& op_, const uint8_t& type_) : op(op_), type(type_) {}
};

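/* Per-block state: the stack of exec masks valid at the end of the block.
 * The global mask(s) sit at the bottom; loops and divergent branches push
 * additional entries on top. */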
struct block_info {
   std::vector<exec_info> exec;
};

struct exec_ctx {
   Program* program;
   std::vector<block_info> info;
   std::vector<loop_info> loop;
   bool handle_wqm = false;
   exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {}
};

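/* Returns true if the instruction must be executed with the Exact mask,
 * i.e. without helper invocations. */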
bool
needs_exact(aco_ptr<Instruction>& instr)
{
   if (instr->isMUBUF()) {
      return instr->mubuf().disable_wqm;
   } else if (instr->isMTBUF()) {
      return instr->mtbuf().disable_wqm;
   } else if (instr->isMIMG()) {
      return instr->mimg().disable_wqm;
   } else if (instr->isFlatLike()) {
      return instr->flatlike().disable_wqm;
   } else {
      /* Require Exact for p_jump_to_epilog because if p_exit_early_if_not is
       * emitted inside the same block, the main FS will always jump to the PS
       * epilog without considering the exec mask.
       */
      return instr->isEXP() || instr->opcode == aco_opcode::p_jump_to_epilog ||
             instr->opcode == aco_opcode::p_dual_src_export_gfx11;
   }
}

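/* Classifies the execution mode an instruction needs: Exact, WQM for
 * anything predicated by the exec mask, or Unspecified otherwise. */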
WQMState
get_instr_needs(aco_ptr<Instruction>& instr)
{
   if (needs_exact(instr))
      return Exact;

   bool pred_by_exec = needs_exec_mask(instr.get()) || instr->opcode == aco_opcode::p_logical_end ||
                       instr->isBranch();

   return pred_by_exec ? WQM : Unspecified;
}

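/* Switches the current exec mask to WQM, either by applying s_wqm to the
 * global mask or by restoring the WQM mask one below the top of the stack. */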
void
transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
{
   if (ctx.info[idx].exec.back().type & mask_type_wqm)
      return;
   if (ctx.info[idx].exec.back().type & mask_type_global) {
      Operand exec_mask = ctx.info[idx].exec.back().op;
      if (exec_mask == Operand(exec, bld.lm))
         ctx.info[idx].exec.back().op = bld.copy(bld.def(bld.lm), exec_mask);

      bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), exec_mask);
      ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask_type_global | mask_type_wqm);
      return;
   }
   /* otherwise, the WQM mask should be one below the current mask */
   ctx.info[idx].exec.pop_back();
   assert(ctx.info[idx].exec.back().type & mask_type_wqm);
   assert(ctx.info[idx].exec.back().op.size() == bld.lm.size());
   assert(ctx.info[idx].exec.back().op.isTemp());
   bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().op);
}

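/* Switches the current exec mask to Exact, either by restoring the saved
 * exact mask below the top of the stack or by ANDing the current mask with
 * the global exact mask and pushing the result. */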
void
transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
{
   if (ctx.info[idx].exec.back().type & mask_type_exact)
      return;
   /* We can't remove the loop exec mask, because that can cause exec.size() to
    * be less than num_exec_masks. The loop exec mask also needs to be kept
    * around for various uses. */
   if ((ctx.info[idx].exec.back().type & mask_type_global) &&
       !(ctx.info[idx].exec.back().type & mask_type_loop)) {
      ctx.info[idx].exec.pop_back();
      assert(ctx.info[idx].exec.back().type & mask_type_exact);
      assert(ctx.info[idx].exec.back().op.size() == bld.lm.size());
      assert(ctx.info[idx].exec.back().op.isTemp());
      bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().op);
      return;
   }
   /* otherwise, we create an exact mask and push to the stack */
   Operand wqm = ctx.info[idx].exec.back().op;
   if (wqm == Operand(exec, bld.lm)) {
      wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                     Definition(exec, bld.lm), ctx.info[idx].exec[0].op, Operand(exec, bld.lm));
   } else {
      bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), ctx.info[idx].exec[0].op,
               wqm);
   }
   ctx.info[idx].exec.back().op = Operand(wqm);
   ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask_type_exact);
}

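/* Emits the instructions needed at the start of a block to reconstruct its
 * exec mask stack from the predecessors: phis for diverged masks and, where
 * necessary, a copy to restore exec after divergent control flow. Returns
 * the index of the first unprocessed instruction in the block. */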
unsigned
add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions)
{
   unsigned idx = block->index;
   Builder bld(ctx.program, &instructions);
   Block::edge_vec& preds = block->linear_preds;
   bool restore_exec = false;

   /* start block */
   if (preds.empty()) {
      aco_ptr<Instruction>& startpgm = block->instructions[0];
      assert(startpgm->opcode == aco_opcode::p_startpgm);
      bld.insert(std::move(startpgm));

      unsigned count = 1;
      while (block->instructions[count]->opcode == aco_opcode::p_init_scratch ||
             block->instructions[count]->opcode == aco_opcode::s_setprio) {
         bld.insert(std::move(block->instructions[count]));
         count++;
      }

      Operand start_exec(exec, bld.lm);

      /* exec seems to need to be manually initialized with combined shaders */
      if (ctx.program->stage.num_sw_stages() > 1 ||
          ctx.program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
          (ctx.program->stage.sw == SWStage::VS &&
           (ctx.program->stage.hw == AC_HW_HULL_SHADER ||
            ctx.program->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER)) ||
          (ctx.program->stage.sw == SWStage::TES &&
           ctx.program->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER)) {
         start_exec = Operand::c32_or_c64(-1u, bld.lm == s2);
         bld.copy(Definition(exec, bld.lm), start_exec);
      }

      /* EXEC is automatically initialized by the HW for compute shaders.
       * We know for sure exec is initially -1 when the shader always has full subgroups.
       */
      if (ctx.program->stage == compute_cs && ctx.program->info.cs.uses_full_subgroups)
         start_exec = Operand::c32_or_c64(-1u, bld.lm == s2);

      if (ctx.handle_wqm) {
         ctx.info[idx].exec.emplace_back(start_exec, mask_type_global | mask_type_exact);
         /* Initialize WQM already */
         transition_to_WQM(ctx, bld, idx);
      } else {
         uint8_t mask = mask_type_global;
         if (ctx.program->needs_wqm) {
            bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
                     Operand(exec, bld.lm));
            mask |= mask_type_wqm;
         } else {
            mask |= mask_type_exact;
         }
         ctx.info[idx].exec.emplace_back(start_exec, mask);
      }

      return count;
   }

   /* loop entry block */
   if (block->kind & block_kind_loop_header) {
      assert(preds[0] == idx - 1);
      ctx.info[idx].exec = ctx.info[idx - 1].exec;
      loop_info& info = ctx.loop.back();
      assert(ctx.info[idx].exec.size() == info.num_exec_masks);

      /* create ssa names for outer exec masks */
      if (info.has_discard && preds.size() > 1) {
         aco_ptr<Instruction> phi;
         for (int i = 0; i < info.num_exec_masks - 1; i++) {
            phi.reset(
               create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
            phi->definitions[0] = bld.def(bld.lm);
            phi->operands[0] = ctx.info[preds[0]].exec[i].op;
            ctx.info[idx].exec[i].op = bld.insert(std::move(phi));
         }
      }

      ctx.info[idx].exec.back().type |= mask_type_loop;

      if (info.has_divergent_continue) {
         /* create ssa name for loop active mask */
         aco_ptr<Instruction> phi{
            create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
         phi->definitions[0] = bld.def(bld.lm);
         phi->operands[0] = ctx.info[preds[0]].exec.back().op;
         ctx.info[idx].exec.back().op = bld.insert(std::move(phi));

         restore_exec = true;
         uint8_t mask_type = ctx.info[idx].exec.back().type & (mask_type_wqm | mask_type_exact);
         ctx.info[idx].exec.emplace_back(ctx.info[idx].exec.back().op, mask_type);
      }

   } else if (block->kind & block_kind_loop_exit) {
      Block* header = ctx.loop.back().loop_header;
      loop_info& info = ctx.loop.back();

      for (ASSERTED unsigned pred : preds)
         assert(ctx.info[pred].exec.size() >= info.num_exec_masks);

      /* fill the loop header phis */
      Block::edge_vec& header_preds = header->linear_preds;
      int instr_idx = 0;
      if (info.has_discard && header_preds.size() > 1) {
         while (instr_idx < info.num_exec_masks - 1) {
            aco_ptr<Instruction>& phi = header->instructions[instr_idx];
            assert(phi->opcode == aco_opcode::p_linear_phi);
            for (unsigned i = 1; i < phi->operands.size(); i++)
               phi->operands[i] = ctx.info[header_preds[i]].exec[instr_idx].op;
            instr_idx++;
         }
      }

      if (info.has_divergent_continue) {
         aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
         assert(phi->opcode == aco_opcode::p_linear_phi);
         for (unsigned i = 1; i < phi->operands.size(); i++)
            phi->operands[i] = ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].op;
         restore_exec = true;
      }

      if (info.has_divergent_break) {
         restore_exec = true;
         /* Drop the loop active mask. */
         info.num_exec_masks--;
      }
      assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);

      /* create the loop exit phis if not trivial */
      for (unsigned exec_idx = 0; exec_idx < info.num_exec_masks; exec_idx++) {
         Operand same = ctx.info[preds[0]].exec[exec_idx].op;
         uint8_t type = ctx.info[header_preds[0]].exec[exec_idx].type;
         bool trivial = true;

         for (unsigned i = 1; i < preds.size() && trivial; i++) {
            if (ctx.info[preds[i]].exec[exec_idx].op != same)
               trivial = false;
         }

         if (trivial) {
            ctx.info[idx].exec.emplace_back(same, type);
         } else {
            /* create phi for loop footer */
            aco_ptr<Instruction> phi{
               create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
            phi->definitions[0] = bld.def(bld.lm);
            for (unsigned i = 0; i < phi->operands.size(); i++)
               phi->operands[i] = ctx.info[preds[i]].exec[exec_idx].op;
            ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
         }
      }

      assert(ctx.info[idx].exec.size() == info.num_exec_masks);
      ctx.loop.pop_back();

   } else if (preds.size() == 1) {
      ctx.info[idx].exec = ctx.info[preds[0]].exec;
   } else {
      assert(preds.size() == 2);
      assert(ctx.info[preds[0]].exec.size() == ctx.info[preds[1]].exec.size());

      unsigned last = ctx.info[preds[0]].exec.size() - 1;

      /* create phis for diverged temporary exec masks */
      for (unsigned i = 0; i < last; i++) {
         /* skip trivial phis */
         if (ctx.info[preds[0]].exec[i].op == ctx.info[preds[1]].exec[i].op) {
            Operand op = ctx.info[preds[0]].exec[i].op;
            /* discard/demote can change the state of the current exec mask */
            assert(!op.isTemp() ||
                   ctx.info[preds[0]].exec[i].type == ctx.info[preds[1]].exec[i].type);
            uint8_t mask = ctx.info[preds[0]].exec[i].type & ctx.info[preds[1]].exec[i].type;
            ctx.info[idx].exec.emplace_back(op, mask);
            continue;
         }

         Operand phi = bld.pseudo(aco_opcode::p_linear_phi, bld.def(bld.lm),
                                  ctx.info[preds[0]].exec[i].op, ctx.info[preds[1]].exec[i].op);
         uint8_t mask_type = ctx.info[preds[0]].exec[i].type & ctx.info[preds[1]].exec[i].type;
         ctx.info[idx].exec.emplace_back(phi, mask_type);
      }

      if (block->kind & block_kind_merge) {
         restore_exec = true;
      } else {
         /* The last mask is already in exec. */
         Operand current_exec = Operand(exec, bld.lm);
         if (ctx.info[preds[0]].exec[last].op == ctx.info[preds[1]].exec[last].op) {
            current_exec = ctx.info[preds[0]].exec[last].op;
         }
         uint8_t mask_type =
            ctx.info[preds[0]].exec[last].type & ctx.info[preds[1]].exec[last].type;
         ctx.info[idx].exec.emplace_back(current_exec, mask_type);
      }
   }

   unsigned i = 0;
   while (block->instructions[i]->opcode == aco_opcode::p_phi ||
          block->instructions[i]->opcode == aco_opcode::p_linear_phi) {
      bld.insert(std::move(block->instructions[i]));
      i++;
   }

   if (ctx.handle_wqm) {
      /* End WQM handling if not needed anymore */
      if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
         if (block->instructions[i]->opcode == aco_opcode::p_end_wqm) {
            ctx.info[idx].exec.back().type |= mask_type_global;
            transition_to_Exact(ctx, bld, idx);
            ctx.handle_wqm = false;
            restore_exec = false;
            i++;
         }
      }
   }

   /* restore exec mask after divergent control flow */
   if (restore_exec) {
      Operand restore = ctx.info[idx].exec.back().op;
      assert(restore.size() == bld.lm.size());
      bld.copy(Definition(exec, bld.lm), restore);
   }

   return i;
}

/* Avoid live-range splits in Exact mode:
 * Because the data register of atomic VMEM instructions
 * is shared between src and dst, it might be necessary
 * to create live-range splits during RA.
 * Make the live-range splits explicit in WQM mode.
 */
void
handle_atomic_data(exec_ctx& ctx, Builder& bld, unsigned block_idx, aco_ptr<Instruction>& instr)
{
   /* check if this is an atomic VMEM instruction */
   int idx = -1;
   if (!instr->isVMEM() || instr->definitions.empty())
      return;
   else if (instr->isMIMG())
      idx = instr->operands[2].isTemp() ? 2 : -1;
   else if (instr->operands.size() == 4)
      idx = 3;

   if (idx != -1) {
      /* insert explicit copy of atomic data in WQM-mode */
      transition_to_WQM(ctx, bld, block_idx);
      Temp data = instr->operands[idx].getTemp();
      data = bld.copy(bld.def(data.regClass()), data);
      instr->operands[idx].setTemp(data);
   }
}

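/* Processes the remaining instructions of a block: inserts WQM/Exact
 * transitions where an instruction requires them and lowers p_discard_if,
 * p_is_helper, p_demote_to_helper, p_elect and p_end_wqm into exec mask
 * operations. */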
void
process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions,
                     unsigned idx)
{
   block_info& info = ctx.info[block->index];
   WQMState state;
   if (info.exec.back().type & mask_type_wqm) {
      state = WQM;
   } else {
      assert(!ctx.handle_wqm || info.exec.back().type & mask_type_exact);
      state = Exact;
   }

   Builder bld(ctx.program, &instructions);

   for (; idx < block->instructions.size(); idx++) {
      aco_ptr<Instruction> instr = std::move(block->instructions[idx]);

      WQMState needs = ctx.handle_wqm ? get_instr_needs(instr) : Unspecified;

      if (needs == WQM && state != WQM) {
         transition_to_WQM(ctx, bld, block->index);
         state = WQM;
      } else if (needs == Exact) {
         if (ctx.handle_wqm)
            handle_atomic_data(ctx, bld, block->index, instr);
         transition_to_Exact(ctx, bld, block->index);
         state = Exact;
      }

      if (instr->opcode == aco_opcode::p_discard_if) {
         Operand current_exec = Operand(exec, bld.lm);

         if (block->instructions[idx + 1]->opcode == aco_opcode::p_end_wqm) {
            /* Transition to Exact without extra instruction. */
            info.exec.resize(1);
            assert(info.exec[0].type == (mask_type_exact | mask_type_global));
            current_exec = info.exec[0].op;
            info.exec[0].op = Operand(exec, bld.lm);
            state = Exact;
         } else if (info.exec.size() >= 2 && ctx.handle_wqm) {
            /* Preserve the WQM mask */
            info.exec[1].type &= ~mask_type_global;
         }

         Temp cond;
         if (instr->operands[0].isConstant()) {
            assert(instr->operands[0].constantValue() == -1u);
            /* save condition and set exec to zero */
            cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                            Definition(exec, bld.lm), Operand::zero(), Operand(exec, bld.lm));
         } else {
            cond = instr->operands[0].getTemp();
            /* discard from current exec */
            bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), current_exec,
                     cond);
         }

         if (info.exec.size() == 1) {
            instr->operands[0] = Operand(exec, bld.lm);
         } else {
            /* discard from inner to outer exec mask on stack */
            int num = info.exec.size() - 2;
            Temp exit_cond;
            for (int i = num; i >= 0; i--) {
               Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                             info.exec[i].op, cond);
               info.exec[i].op = Operand(andn2->definitions[0].getTemp());
               exit_cond = andn2->definitions[1].getTemp();
            }
            instr->operands[0] = bld.scc(exit_cond);
         }

         info.exec.back().op = Operand(exec, bld.lm);
         instr->opcode = aco_opcode::p_exit_early_if_not;
         assert(!ctx.handle_wqm || (info.exec[0].type & mask_type_wqm) == 0);
      } else if (instr->opcode == aco_opcode::p_is_helper) {
         Definition dst = instr->definitions[0];
         assert(dst.size() == bld.lm.size());
         if (state == Exact) {
            instr.reset(create_instruction(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
            instr->operands[0] = Operand::zero();
            instr->definitions[0] = dst;
         } else {
            exec_info& exact_mask = info.exec[0];
            assert(exact_mask.type & mask_type_exact);

            instr.reset(create_instruction(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
            instr->operands[0] = Operand(exec, bld.lm); /* current exec */
            instr->operands[1] = Operand(exact_mask.op);
            instr->definitions[0] = dst;
            instr->definitions[1] = bld.def(s1, scc);
         }
      } else if (instr->opcode == aco_opcode::p_demote_to_helper) {
         assert((info.exec[0].type & mask_type_exact) && (info.exec[0].type & mask_type_global));

         const bool nested_cf = !(info.exec.back().type & mask_type_global);
         if (ctx.handle_wqm && state == Exact && nested_cf) {
            /* Transition back to WQM without extra instruction. */
            info.exec.pop_back();
            state = WQM;
         } else if (block->instructions[idx + 1]->opcode == aco_opcode::p_end_wqm) {
            /* Transition to Exact without extra instruction. */
            info.exec.resize(1);
            state = Exact;
         } else if (nested_cf) {
            /* Save current exec temporarily. */
            info.exec.back().op = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
         } else {
            info.exec.back().op = Operand(exec, bld.lm);
         }

         /* Remove invocations from global exact mask. */
         Definition def = state == Exact ? Definition(exec, bld.lm) : bld.def(bld.lm);
         Operand src = instr->operands[0].isConstant() ? Operand(exec, bld.lm) : instr->operands[0];

         bld.sop2(Builder::s_andn2, def, bld.def(s1, scc), info.exec[0].op, src);
         info.exec[0].op = def.isTemp() ? Operand(def.getTemp()) : Operand(exec, bld.lm);

         /* Update global WQM mask and store in exec. */
         if (state == WQM) {
            assert(info.exec.size() > 1);
            bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), def.getTemp());
         }

         /* End shader if global mask is zero. */
         instr->opcode = aco_opcode::p_exit_early_if_not;
         instr->operands[0] = Operand(exec, bld.lm);
         bld.insert(std::move(instr));

         /* Update all other exec masks. */
         if (nested_cf) {
            const unsigned global_idx = state == WQM ? 1 : 0;
            for (unsigned i = global_idx + 1; i < info.exec.size() - 1; i++) {
               info.exec[i].op = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
                                          info.exec[i].op, Operand(exec, bld.lm));
            }
            /* Update current exec and save WQM mask. */
            info.exec[global_idx].op =
               bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                        Definition(exec, bld.lm), info.exec.back().op, Operand(exec, bld.lm));
            info.exec.back().op = Operand(exec, bld.lm);
         }
         continue;

      } else if (instr->opcode == aco_opcode::p_elect) {
         bool all_lanes_enabled = info.exec.back().op.constantEquals(-1u);
         Definition dst = instr->definitions[0];

         if (all_lanes_enabled) {
            bld.copy(Definition(dst), Operand::c32_or_c64(1u, dst.size() == 2));
         } else {
            Temp first_lane_idx = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
            bld.sop2(Builder::s_lshl, Definition(dst), bld.def(s1, scc),
                     Operand::c32_or_c64(1u, dst.size() == 2), Operand(first_lane_idx));
         }
         continue;
      } else if (instr->opcode == aco_opcode::p_end_wqm) {
         assert(block->kind & block_kind_top_level);
         assert(info.exec.size() <= 2);
         /* This instruction indicates the end of WQM mode. */
         info.exec.back().type |= mask_type_global;
         transition_to_Exact(ctx, bld, block->index);
         state = Exact;
         ctx.handle_wqm = false;
         continue;
      }

      bld.insert(std::move(instr));
   }
}

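/* Lowers the branch at the end of a block according to the block kind,
 * updating the exec mask stack for loops, divergent branches, breaks and
 * continues. */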
void
add_branch_code(exec_ctx& ctx, Block* block)
{
   unsigned idx = block->index;
   Builder bld(ctx.program, block);

   if (block->linear_succs.empty())
      return;

   if (block->kind & block_kind_loop_preheader) {
      /* collect information about the succeeding loop */
      bool has_divergent_break = false;
      bool has_divergent_continue = false;
      bool has_discard = false;
      unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth;

      for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) {
         Block& loop_block = ctx.program->blocks[i];

         if (loop_block.kind & block_kind_uses_discard)
            has_discard = true;
         if (loop_block.loop_nest_depth != loop_nest_depth)
            continue;

         if (loop_block.kind & block_kind_uniform)
            continue;
         else if (loop_block.kind & block_kind_break)
            has_divergent_break = true;
         else if (loop_block.kind & block_kind_continue)
            has_divergent_continue = true;
      }

      if (has_divergent_break) {
         /* save the exec mask so it can be restored after the loop */
         const Operand& current_exec = ctx.info[idx].exec.back().op;
         if (!current_exec.isTemp() && !current_exec.isConstant()) {
            bld.reset(bld.instructions, std::prev(bld.instructions->end()));
            Operand restore = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
            ctx.info[idx].exec.back().op = restore;
            bld.reset(bld.instructions);
         }
         uint8_t mask = ctx.info[idx].exec.back().type & (mask_type_wqm | mask_type_exact);
         ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask);
      }
      unsigned num_exec_masks = ctx.info[idx].exec.size();

      ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks,
                            has_divergent_break, has_divergent_continue, has_discard);

      Pseudo_branch_instruction& branch = block->instructions.back()->branch();
      branch.target[0] = block->linear_succs[0];
   } else if (block->kind & block_kind_continue_or_break) {
      assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind &
             block_kind_loop_header);
      assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind &
             block_kind_loop_exit);
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      bool need_parallelcopy = false;
      while (!(ctx.info[idx].exec.back().type & mask_type_loop)) {
         ctx.info[idx].exec.pop_back();
         need_parallelcopy = true;
      }

      if (need_parallelcopy)
         bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().op);
      bld.branch(aco_opcode::p_cbranch_nz, Operand(exec, bld.lm), block->linear_succs[1],
                 block->linear_succs[0]);
   } else if (block->kind & block_kind_uniform) {
      Pseudo_branch_instruction& branch = block->instructions.back()->branch();
      if (branch.opcode == aco_opcode::p_branch) {
         branch.target[0] = block->linear_succs[0];
      } else {
         branch.target[0] = block->linear_succs[1];
         branch.target[1] = block->linear_succs[0];
      }
   } else if (block->kind & block_kind_branch) {
      // orig = s_and_saveexec_b64
      assert(block->linear_succs.size() == 2);
      assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
      Temp cond = block->instructions.back()->operands[0].getTemp();
      aco_ptr<Instruction> branch = std::move(block->instructions.back());
      block->instructions.pop_back();

      uint8_t mask_type = ctx.info[idx].exec.back().type & (mask_type_wqm | mask_type_exact);
      if (ctx.info[idx].exec.back().op.constantEquals(-1u)) {
         bld.copy(Definition(exec, bld.lm), cond);
      } else if (ctx.info[idx].exec.back().op.isTemp()) {
         bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), cond,
                  Operand(exec, bld.lm));
      } else {
         Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                                  Definition(exec, bld.lm), cond, Operand(exec, bld.lm));

         ctx.info[idx].exec.back().op = Operand(old_exec);
      }

      /* add next current exec to the stack */
      ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask_type);

      Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, Operand(exec, bld.lm),
                                     block->linear_succs[1], block->linear_succs[0]);
      r->branch().rarely_taken = branch->branch().rarely_taken;
      r->branch().never_taken = branch->branch().never_taken;
   } else if (block->kind & block_kind_invert) {
      // exec = s_andn2_b64 (original_exec, exec)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      aco_ptr<Instruction> branch = std::move(block->instructions.back());
      block->instructions.pop_back();
      assert(ctx.info[idx].exec.size() >= 2);
      Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].op;
      bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
               Operand(exec, bld.lm));

      Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, Operand(exec, bld.lm),
                                     block->linear_succs[1], block->linear_succs[0]);
      r->branch().rarely_taken = branch->branch().rarely_taken;
      r->branch().never_taken = branch->branch().never_taken;
   } else if (block->kind & block_kind_break) {
      // loop_mask = s_andn2_b64 (loop_mask, exec)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      Temp cond = Temp();
      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
         cond = bld.tmp(s1);
         Operand exec_mask = ctx.info[idx].exec[exec_idx].op;
         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                              exec_mask, Operand(exec, bld.lm));
         ctx.info[idx].exec[exec_idx].op = exec_mask;
         if (ctx.info[idx].exec[exec_idx].type & mask_type_loop)
            break;
      }

      /* check if the successor is the merge block, otherwise set exec to 0 */
      // TODO: this could be done better by directly branching to the merge block
      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
      Block& succ = ctx.program->blocks[succ_idx];
      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
         bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
      }

      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1],
                 block->linear_succs[0]);
   } else if (block->kind & block_kind_continue) {
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      Temp cond = Temp();
      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
         if (ctx.info[idx].exec[exec_idx].type & mask_type_loop)
            break;
         cond = bld.tmp(s1);
         Operand exec_mask = ctx.info[idx].exec[exec_idx].op;
         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                              exec_mask, Operand(exec, bld.lm));
         ctx.info[idx].exec[exec_idx].op = exec_mask;
      }
      assert(cond != Temp());

      /* check if the successor is the merge block, otherwise set exec to 0 */
      // TODO: this could be done better by directly branching to the merge block
      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
      Block& succ = ctx.program->blocks[succ_idx];
      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
         bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
      }

      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1],
                 block->linear_succs[0]);
   } else {
      unreachable("unknown/invalid block type");
   }
}

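/* Rebuilds a single block: the coupling code, then the block's
 * instructions, then the branch lowering. */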
void
process_block(exec_ctx& ctx, Block* block)
{
   std::vector<aco_ptr<Instruction>> instructions;
   instructions.reserve(block->instructions.size());

   unsigned idx = add_coupling_code(ctx, block, instructions);

   assert(!block->linear_succs.empty() || ctx.info[block->index].exec.size() <= 2);

   process_instructions(ctx, block, instructions, idx);

   block->instructions = std::move(instructions);

   add_branch_code(ctx, block);
}

} /* end namespace */

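/* Pass entry point: processes all blocks in program order. WQM handling is
 * only enabled if the program contains both WQM and Exact instructions. */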
void
insert_exec_mask(Program* program)
{
   exec_ctx ctx(program);

   if (program->needs_wqm && program->needs_exact)
      ctx.handle_wqm = true;

   for (Block& block : program->blocks)
      process_block(ctx, &block);
}

} // namespace aco