/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include "util/u_math.h"

namespace aco {

namespace {

enum WQMState : uint8_t {
   Unspecified = 0,
   Exact = 1 << 0,
   WQM = 1 << 1, /* with control flow applied */
   Preserve_WQM = 1 << 2,
   Exact_Branch = 1 << 3,
};

enum mask_type : uint8_t {
   mask_type_global = 1 << 0,
   mask_type_exact = 1 << 1,
   mask_type_wqm = 1 << 2,
   mask_type_loop = 1 << 3, /* active lanes of a loop */
   mask_type_initial = 1 << 4, /* initially active lanes */
};

struct wqm_ctx {
   Program* program;
   /* state for WQM propagation */
   std::set<unsigned> worklist;
   std::vector<uint16_t> defined_in;
   std::vector<bool> needs_wqm;
   std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */
   bool loop;
   bool wqm;
   wqm_ctx(Program* program) : program(program),
                               defined_in(program->peekAllocationId(), 0xFFFF),
                               needs_wqm(program->peekAllocationId()),
                               branch_wqm(program->blocks.size()),
                               loop(false),
                               wqm(false)
   {
      for (unsigned i = 0; i < program->blocks.size(); i++)
         worklist.insert(i);
   }
};

struct loop_info {
   Block* loop_header;
   uint16_t num_exec_masks;
   uint8_t needs;
   bool has_divergent_break;
   bool has_divergent_continue;
   bool has_discard; /* has a discard or demote */
   loop_info(Block* b, uint16_t num, uint8_t needs, bool breaks, bool cont, bool discard) :
             loop_header(b), num_exec_masks(num), needs(needs), has_divergent_break(breaks),
             has_divergent_continue(cont), has_discard(discard) {}
};

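/* Per-block state. "exec" is the stack of exec masks live at the end of the
 * block, outermost first: each entry pairs the SSA temporary holding the mask
 * with its mask_type flags. */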
struct block_info {
   std::vector<std::pair<Temp, uint8_t>> exec;
   std::vector<WQMState> instr_needs;
   uint8_t block_needs;
   uint8_t ever_again_needs;
   bool logical_end_wqm;
   /* more... */
};

struct exec_ctx {
   Program *program;
   std::vector<block_info> info;
   std::vector<loop_info> loop;
   bool handle_wqm = false;
   exec_ctx(Program *program) : program(program), info(program->blocks.size()) {}
};

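/* Returns whether the instruction is predicated by the exec mask: SALU only
 * if it reads exec, SMEM/barriers/spills/reloads and lane-access instructions
 * never, pseudo vector/copy instructions only if they write a VGPR, and
 * everything else (VALU, VMEM, ...) always. */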
bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
   if (instr->isSALU())
      return instr->reads_exec();
   if (instr->format == Format::SMEM || instr->isSALU())
      return false;
   if (instr->format == Format::PSEUDO_BARRIER)
      return false;

   if (instr->format == Format::PSEUDO) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_parallelcopy:
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return false;
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
         return false;
      default:
         break;
      }
   }

   if (instr->opcode == aco_opcode::v_readlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32_e64 ||
       instr->opcode == aco_opcode::v_writelane_b32 ||
       instr->opcode == aco_opcode::v_writelane_b32_e64)
      return false;

   return true;
}

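/* Returns whether the instruction must execute with the Exact mask (i.e.
 * without helper invocations): WQM-disabled buffer/image/flat stores,
 * exports and p_fs_buffer_store_smem. */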
bool needs_exact(aco_ptr<Instruction>& instr) {
   if (instr->format == Format::MUBUF) {
      MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get());
      return mubuf->disable_wqm;
   } else if (instr->format == Format::MTBUF) {
      MTBUF_instruction *mtbuf = static_cast<MTBUF_instruction *>(instr.get());
      return mtbuf->disable_wqm;
   } else if (instr->format == Format::MIMG) {
      MIMG_instruction *mimg = static_cast<MIMG_instruction *>(instr.get());
      return mimg->disable_wqm;
   } else if (instr->format == Format::FLAT || instr->format == Format::GLOBAL) {
      FLAT_instruction *flat = static_cast<FLAT_instruction *>(instr.get());
      return flat->disable_wqm;
   } else {
      return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
   }
}

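/* Marks a temporary as needing WQM and re-queues its defining block so that
 * the requirement propagates backwards to the temporary's operands. */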
void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
{
   if (!ctx.needs_wqm[tmp.id()]) {
      ctx.needs_wqm[tmp.id()] = true;
      if (ctx.defined_in[tmp.id()] != 0xFFFF)
         ctx.worklist.insert(ctx.defined_in[tmp.id()]);
   }
}

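/* Flags the branch condition of a block, and recursively of its logical
 * predecessors, as needing WQM, stopping at top-level blocks. */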
void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
{
   if (ctx.branch_wqm[block_idx])
      return;

   ctx.branch_wqm[block_idx] = true;
   ctx.worklist.insert(block_idx);

   Block& block = ctx.program->blocks[block_idx];

   /* TODO: this sets more branch conditions to WQM than it needs to;
    * it should be enough to stop at the "exec mask top level" */
   if (block.kind & block_kind_top_level)
      return;

   for (unsigned pred_idx : block.logical_preds)
      mark_block_wqm(ctx, pred_idx);
}

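/* Determines for each instruction of the block whether it needs WQM, Exact
 * or neither, records the result in instr_needs/block_needs, and propagates
 * WQM requirements to operands and to logical predecessors. */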
void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
{
   block_info& info = exec_ctx.info[block->index];

   std::vector<WQMState> instr_needs(block->instructions.size());

   if (block->kind & block_kind_top_level) {
      if (ctx.loop && ctx.wqm) {
         unsigned block_idx = block->index + 1;
         while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level)) {
            /* flag all break conditions as WQM:
             * the conditions might be computed outside the nested CF */
            if (ctx.program->blocks[block_idx].kind & block_kind_break)
               mark_block_wqm(ctx, block_idx);
            /* flag all blocks as WQM to ensure we enter all (nested) loops in WQM */
            exec_ctx.info[block_idx].block_needs |= WQM;
            block_idx++;
         }
      } else if (ctx.loop && !ctx.wqm) {
         /* Ensure a branch never results in an exec mask with only helper
          * invocations (which can cause a loop to repeat infinitely if its
          * break branches are done in exact). */
         unsigned block_idx = block->index;
         do {
            if ((ctx.program->blocks[block_idx].kind & block_kind_branch))
               exec_ctx.info[block_idx].block_needs |= Exact_Branch;
            block_idx++;
         } while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level));
      }

      ctx.loop = false;
      ctx.wqm = false;
   }

   for (int i = block->instructions.size() - 1; i >= 0; --i) {
      aco_ptr<Instruction>& instr = block->instructions[i];

      WQMState needs = needs_exact(instr) ? Exact : Unspecified;
      bool propagate_wqm = instr->opcode == aco_opcode::p_wqm;
      bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
      bool pred_by_exec = pred_by_exec_mask(instr);
      for (const Definition& definition : instr->definitions) {
         if (!definition.isTemp())
            continue;
         const unsigned def = definition.tempId();
         ctx.defined_in[def] = block->index;
         if (needs == Unspecified && ctx.needs_wqm[def]) {
            needs = pred_by_exec ? WQM : Unspecified;
            propagate_wqm = true;
         }
      }

      if (instr->format == Format::PSEUDO_BRANCH && ctx.branch_wqm[block->index]) {
         needs = WQM;
         propagate_wqm = true;
      }

      if (propagate_wqm) {
         for (const Operand& op : instr->operands) {
            if (op.isTemp()) {
               set_needs_wqm(ctx, op.getTemp());
            }
         }
      } else if (preserve_wqm && info.block_needs & WQM) {
         needs = Preserve_WQM;
      }

      /* ensure the condition controlling the control flow for this phi is in WQM */
      if (needs == WQM && instr->opcode == aco_opcode::p_phi) {
         for (unsigned pred_idx : block->logical_preds) {
            mark_block_wqm(ctx, pred_idx);
            exec_ctx.info[pred_idx].logical_end_wqm = true;
            ctx.worklist.insert(pred_idx);
         }
      }

      if ((instr->opcode == aco_opcode::p_logical_end && info.logical_end_wqm) ||
          instr->opcode == aco_opcode::p_wqm) {
         assert(needs != Exact);
         needs = WQM;
      }

      instr_needs[i] = needs;
      info.block_needs |= needs;
   }

   info.instr_needs = instr_needs;

   /* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>",
    * <cond> should be computed in WQM */
   if (info.block_needs & WQM && !(block->kind & block_kind_top_level)) {
      for (unsigned pred_idx : block->logical_preds)
         mark_block_wqm(ctx, pred_idx);
      ctx.wqm = true;
   }
   if (block->kind & block_kind_loop_header)
      ctx.loop = true;
}

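/* Fixed-point analysis over the whole program: drains the block worklist,
 * then walks the blocks backwards to accumulate in ever_again_needs which
 * masks (WQM/Exact/Preserve_WQM) are still needed later on. */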
void calculate_wqm_needs(exec_ctx& exec_ctx)
{
   wqm_ctx ctx(exec_ctx.program);

   while (!ctx.worklist.empty()) {
      unsigned block_index = *std::prev(ctx.worklist.end());
      ctx.worklist.erase(std::prev(ctx.worklist.end()));

      get_block_needs(ctx, exec_ctx, &exec_ctx.program->blocks[block_index]);
   }

   uint8_t ever_again_needs = 0;
   for (int i = exec_ctx.program->blocks.size() - 1; i >= 0; i--) {
      exec_ctx.info[i].ever_again_needs = ever_again_needs;
      Block& block = exec_ctx.program->blocks[i];

      if (block.kind & block_kind_needs_lowering)
         exec_ctx.info[i].block_needs |= Exact;

      /* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
      if ((block.kind & block_kind_discard ||
           block.kind & block_kind_uses_discard_if) &&
          ever_again_needs & WQM)
         exec_ctx.info[i].block_needs |= Preserve_WQM;

      ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch;
      if (block.kind & block_kind_discard ||
          block.kind & block_kind_uses_discard_if ||
          block.kind & block_kind_uses_demote)
         ever_again_needs |= Exact;

      /* don't propagate WQM preservation further than the next top_level block */
      if (block.kind & block_kind_top_level)
         ever_again_needs &= ~Preserve_WQM;
      else
         exec_ctx.info[i].block_needs &= ~Preserve_WQM;
   }
   exec_ctx.handle_wqm = true;
}

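/* Switches the current exec mask to WQM: either derives a new WQM mask from
 * a global mask with s_wqm and pushes it, or pops back to the WQM mask that
 * sits directly below the current one on the stack. */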
void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
{
   if (ctx.info[idx].exec.back().second & mask_type_wqm)
      return;
   if (ctx.info[idx].exec.back().second & mask_type_global) {
      Temp exec_mask = ctx.info[idx].exec.back().first;
      /* TODO: we might generate better code if we pass the uncopied "exec_mask"
       * directly to the s_wqm (we still need to keep this parallelcopy for
       * potential later uses of exec_mask though). We currently can't do this
       * because of a RA bug. */
      exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), bld.exec(exec_mask));
      ctx.info[idx].exec.back().first = exec_mask;

      exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask);
      ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
      return;
   }
   /* otherwise, the WQM mask should be one below the current mask */
   ctx.info[idx].exec.pop_back();
   assert(ctx.info[idx].exec.back().second & mask_type_wqm);
   assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                ctx.info[idx].exec.back().first);
}

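/* Switches the current exec mask to Exact: pops back to a global exact mask
 * where possible, otherwise computes a new exact mask by ANDing the
 * outermost mask into exec with s_and_saveexec and pushes it. */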
void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
{
   if (ctx.info[idx].exec.back().second & mask_type_exact)
      return;
   /* We can't remove the loop exec mask, because that can cause exec.size() to
    * be less than num_exec_masks. The loop exec mask also needs to be kept
    * around for various uses. */
   if ((ctx.info[idx].exec.back().second & mask_type_global) &&
       !(ctx.info[idx].exec.back().second & mask_type_loop)) {
      ctx.info[idx].exec.pop_back();
      assert(ctx.info[idx].exec.back().second & mask_type_exact);
      assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                   ctx.info[idx].exec.back().first);
      return;
   }
   /* otherwise, we create an exact mask and push to the stack */
   Temp wqm = ctx.info[idx].exec.back().first;
   Temp exact = bld.tmp(bld.lm);
   wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                  bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
   ctx.info[idx].exec.back().first = wqm;
   ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
}

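/* Emits the exec-mask bookkeeping at the top of a block (start block, loop
 * header, loop exit or ordinary merge): rebuilds the exec stack from the
 * predecessors, creates the required linear phis and parallelcopies, and
 * returns the index of the first instruction left to process. */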
unsigned add_coupling_code(exec_ctx& ctx, Block* block,
                           std::vector<aco_ptr<Instruction>>& instructions)
{
   unsigned idx = block->index;
   Builder bld(ctx.program, &instructions);
   std::vector<unsigned>& preds = block->linear_preds;

   /* start block */
   if (idx == 0) {
      aco_ptr<Instruction>& startpgm = block->instructions[0];
      assert(startpgm->opcode == aco_opcode::p_startpgm);
      Temp exec_mask = startpgm->definitions.back().getTemp();
      bld.insert(std::move(startpgm));

      /* exec seems to need to be manually initialized with combined shaders */
      if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG) {
         bld.copy(bld.exec(Definition(exec_mask)), Operand(UINT32_MAX, bld.lm == s2));
         instructions[0]->definitions.pop_back();
      }

      if (ctx.handle_wqm) {
         ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial);
         /* if this block only needs WQM, initialize already */
         if (ctx.info[0].block_needs == WQM)
            transition_to_WQM(ctx, bld, 0);
      } else {
         uint8_t mask = mask_type_global;
         if (ctx.program->needs_wqm) {
            exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
            mask |= mask_type_wqm;
         } else {
            mask |= mask_type_exact;
         }
         ctx.info[0].exec.emplace_back(exec_mask, mask);
      }

      return 1;
   }

   /* loop entry block */
   if (block->kind & block_kind_loop_header) {
      assert(preds[0] == idx - 1);
      ctx.info[idx].exec = ctx.info[idx - 1].exec;
      loop_info& info = ctx.loop.back();
      while (ctx.info[idx].exec.size() > info.num_exec_masks)
         ctx.info[idx].exec.pop_back();

      /* create ssa names for outer exec masks */
      if (info.has_discard) {
         aco_ptr<Pseudo_instruction> phi;
         for (int i = 0; i < info.num_exec_masks - 1; i++) {
            phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
            phi->definitions[0] = bld.def(bld.lm);
            phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
            ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
         }
      }

      /* create ssa name for restore mask */
      if (info.has_divergent_break) {
         /* this phi might be trivial but ensures a parallelcopy on the loop header */
         aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
         phi->definitions[0] = bld.def(bld.lm);
         phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
         ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
      }

      /* create ssa name for loop active mask */
      aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
      if (info.has_divergent_continue)
         phi->definitions[0] = bld.def(bld.lm);
      else
         phi->definitions[0] = bld.def(bld.lm, exec);
      phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
      Temp loop_active = bld.insert(std::move(phi));

      if (info.has_divergent_break) {
         uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
         ctx.info[idx].exec.emplace_back(loop_active, mask_type);
      } else {
         ctx.info[idx].exec.back().first = loop_active;
         ctx.info[idx].exec.back().second |= mask_type_loop;
      }

      /* create a parallelcopy to move the active mask to exec */
      unsigned i = 0;
      if (info.has_divergent_continue) {
         while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
            bld.insert(std::move(block->instructions[i]));
            i++;
         }
         uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
         assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
         ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                    ctx.info[idx].exec.back().first), mask_type);
      }

      return i;
   }

   /* loop exit block */
   if (block->kind & block_kind_loop_exit) {
      Block* header = ctx.loop.back().loop_header;
      loop_info& info = ctx.loop.back();

      for (ASSERTED unsigned pred : preds)
         assert(ctx.info[pred].exec.size() >= info.num_exec_masks);

      /* fill the loop header phis */
      std::vector<unsigned>& header_preds = header->linear_preds;
      int k = 0;
      if (info.has_discard) {
         while (k < info.num_exec_masks - 1) {
            aco_ptr<Instruction>& phi = header->instructions[k];
            assert(phi->opcode == aco_opcode::p_linear_phi);
            for (unsigned i = 1; i < phi->operands.size(); i++)
               phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[k].first);
            k++;
         }
      }
      aco_ptr<Instruction>& phi = header->instructions[k++];
      assert(phi->opcode == aco_opcode::p_linear_phi);
      for (unsigned i = 1; i < phi->operands.size(); i++)
         phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);

      if (info.has_divergent_break) {
         aco_ptr<Instruction>& phi = header->instructions[k];
         assert(phi->opcode == aco_opcode::p_linear_phi);
         for (unsigned i = 1; i < phi->operands.size(); i++)
            phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
      }

      assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);

      /* create the loop exit phis if not trivial */
      bool need_parallelcopy = false;
      for (unsigned k = 0; k < info.num_exec_masks; k++) {
         Temp same = ctx.info[preds[0]].exec[k].first;
         uint8_t type = ctx.info[header_preds[0]].exec[k].second;
         bool trivial = true;

         for (unsigned i = 1; i < preds.size() && trivial; i++) {
            if (ctx.info[preds[i]].exec[k].first != same)
               trivial = false;
         }

         if (k == info.num_exec_masks - 1u) {
            bool all_liveout_exec = true;
            bool all_not_liveout_exec = true;
            for (unsigned pred : preds) {
               all_liveout_exec = all_liveout_exec && same == ctx.program->blocks[pred].live_out_exec;
               all_not_liveout_exec = all_not_liveout_exec && same != ctx.program->blocks[pred].live_out_exec;
            }
            if (!all_liveout_exec && !all_not_liveout_exec)
               trivial = false;
            else if (all_not_liveout_exec)
               need_parallelcopy = true;

            need_parallelcopy |= !trivial;
         }

         if (trivial) {
            ctx.info[idx].exec.emplace_back(same, type);
         } else {
            /* create phi for loop footer */
            aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
            phi->definitions[0] = bld.def(bld.lm);
            if (k == info.num_exec_masks - 1u) {
               phi->definitions[0].setFixed(exec);
               need_parallelcopy = false;
            }
            for (unsigned i = 0; i < phi->operands.size(); i++)
               phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
            ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
         }
      }
      assert(ctx.info[idx].exec.size() == info.num_exec_masks);

      /* create a parallelcopy to move the live mask to exec */
      unsigned i = 0;
      while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
         bld.insert(std::move(block->instructions[i]));
         i++;
      }

      if (ctx.handle_wqm) {
         if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
            if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
                (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
               ctx.info[idx].exec.back().second |= mask_type_global;
               transition_to_Exact(ctx, bld, idx);
               ctx.handle_wqm = false;
            }
         }
         if (ctx.info[idx].block_needs == WQM)
            transition_to_WQM(ctx, bld, idx);
         else if (ctx.info[idx].block_needs == Exact)
            transition_to_Exact(ctx, bld, idx);
      }

      assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
      if (need_parallelcopy) {
         /* only create this parallelcopy if needed, since the operand isn't
          * fixed to exec which causes the spiller to miscalculate register demand */
         /* TODO: Fix register_demand calculation for spilling on loop exits.
          * The problem is only mitigated because the register demand could be
          * higher if the exec phi doesn't get assigned to exec. */
         ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                      ctx.info[idx].exec.back().first);
      }

      ctx.loop.pop_back();
      return i;
   }

   if (preds.size() == 1) {
      ctx.info[idx].exec = ctx.info[preds[0]].exec;
   } else {
      assert(preds.size() == 2);
      /* if one of the predecessors ends in an exact mask, we pop it from the stack */
      unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(),
                                         ctx.info[preds[1]].exec.size());
      if (block->kind & block_kind_top_level && !(block->kind & block_kind_merge))
         num_exec_masks = std::min(num_exec_masks, 2u);

      /* create phis for diverged exec masks */
      for (unsigned i = 0; i < num_exec_masks; i++) {
         bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
         if (!in_exec && ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
            assert(ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
            ctx.info[idx].exec.emplace_back(ctx.info[preds[0]].exec[i]);
            continue;
         }

         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
                               ctx.info[preds[0]].exec[i].first,
                               ctx.info[preds[1]].exec[i].first);
         uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
         ctx.info[idx].exec.emplace_back(phi, mask_type);
      }
   }

   unsigned i = 0;
   while (block->instructions[i]->opcode == aco_opcode::p_phi ||
          block->instructions[i]->opcode == aco_opcode::p_linear_phi) {
      bld.insert(std::move(block->instructions[i]));
      i++;
   }

   if (block->kind & block_kind_merge)
      ctx.info[idx].exec.pop_back();

   if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 3) {
      assert(ctx.info[idx].exec.back().second == mask_type_exact);
      assert(block->kind & block_kind_merge);
      ctx.info[idx].exec.pop_back();
   }

   /* try to satisfy the block's needs */
   if (ctx.handle_wqm) {
      if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
         if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
             (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
            ctx.info[idx].exec.back().second |= mask_type_global;
            transition_to_Exact(ctx, bld, idx);
            ctx.handle_wqm = false;
         }
      }
      if (ctx.info[idx].block_needs == WQM)
         transition_to_WQM(ctx, bld, idx);
      else if (ctx.info[idx].block_needs == Exact)
         transition_to_Exact(ctx, bld, idx);
   }

   if (block->kind & block_kind_merge) {
      Temp restore = ctx.info[idx].exec.back().first;
      assert(restore.size() == bld.lm.size());
      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
   }

   return i;
}

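/* Lowers p_fs_buffer_store_smem to a real s_buffer_store_dword*. If exec
 * might be empty, the offset is replaced with UINT32_MAX so the store
 * becomes a no-op. */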
void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
{
   Operand offset = instr->operands[1];
   if (need_check) {
      /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
      Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u, bld.lm == s2));

      if (offset.isLiteral())
         offset = bld.copy(bld.def(s1), offset);

      offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
                        offset, Operand(UINT32_MAX), bld.scc(nonempty));
   } else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
      offset = bld.copy(bld.hint_m0(bld.def(s1)), offset);
   }
   if (!offset.isConstant())
      offset.setFixed(m0);

   switch (instr->operands[2].size()) {
   case 1:
      instr->opcode = aco_opcode::s_buffer_store_dword;
      break;
   case 2:
      instr->opcode = aco_opcode::s_buffer_store_dwordx2;
      break;
   case 4:
      instr->opcode = aco_opcode::s_buffer_store_dwordx4;
      break;
   default:
      unreachable("Invalid SMEM buffer store size");
   }
   instr->operands[1] = offset;
   /* as_uniform() needs to be done here so it's done in exact mode and helper
    * lanes don't contribute. */
   instr->operands[2] = Operand(bld.as_uniform(instr->operands[2]));
}

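/* Walks the remaining instructions of a block, inserting WQM/Exact
 * transitions where an instruction's needs differ from the current state,
 * and lowering p_discard_if, p_is_helper/p_load_helper, p_demote_to_helper
 * and p_fs_buffer_store_smem. */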
void process_instructions(exec_ctx& ctx, Block* block,
                          std::vector<aco_ptr<Instruction>>& instructions,
                          unsigned idx)
{
   WQMState state;
   if (ctx.info[block->index].exec.back().second & mask_type_wqm)
      state = WQM;
   else {
      assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact);
      state = Exact;
   }

   /* if the block doesn't need both WQM and Exact, we can skip processing the instructions */
   bool process = (ctx.handle_wqm &&
                   (ctx.info[block->index].block_needs & state) !=
                   (ctx.info[block->index].block_needs & (WQM | Exact))) ||
                  block->kind & block_kind_uses_discard_if ||
                  block->kind & block_kind_uses_demote ||
                  block->kind & block_kind_needs_lowering;
   if (!process) {
      std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx);
      instructions.insert(instructions.end(),
                          std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it),
                          std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
      return;
   }

   Builder bld(ctx.program, &instructions);

   for (; idx < block->instructions.size(); idx++) {
      aco_ptr<Instruction> instr = std::move(block->instructions[idx]);

      WQMState needs = ctx.handle_wqm ? ctx.info[block->index].instr_needs[idx] : Unspecified;

      if (instr->opcode == aco_opcode::p_discard_if) {
         if (ctx.info[block->index].block_needs & Preserve_WQM) {
            assert(block->kind & block_kind_top_level);
            transition_to_WQM(ctx, bld, block->index);
            ctx.info[block->index].exec.back().second &= ~mask_type_global;
         }
         int num = ctx.info[block->index].exec.size();
         assert(num);
         Operand cond = instr->operands[0];
         for (int i = num - 1; i >= 0; i--) {
            Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                          ctx.info[block->index].exec[i].first, cond);
            if (i == num - 1) {
               andn2->operands[0].setFixed(exec);
               andn2->definitions[0].setFixed(exec);
            }
            if (i == 0) {
               instr->opcode = aco_opcode::p_exit_early_if;
               instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
            }
            ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
         }
         assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);

      } else if (needs == WQM && state != WQM) {
         transition_to_WQM(ctx, bld, block->index);
         state = WQM;
      } else if (needs == Exact && state != Exact) {
         transition_to_Exact(ctx, bld, block->index);
         state = Exact;
      }

      if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) {
         Definition dst = instr->definitions[0];
         assert(dst.size() == bld.lm.size());
         if (state == Exact) {
            instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
            instr->operands[0] = Operand(0u);
            instr->definitions[0] = dst;
         } else {
            std::pair<Temp, uint8_t>& exact_mask = ctx.info[block->index].exec[0];
            if (instr->opcode == aco_opcode::p_load_helper &&
                !(ctx.info[block->index].exec[0].second & mask_type_initial)) {
               /* find last initial exact mask */
               for (int i = block->index; i >= 0; i--) {
                  if (ctx.program->blocks[i].kind & block_kind_top_level &&
                      ctx.info[i].exec[0].second & mask_type_initial) {
                     exact_mask = ctx.info[i].exec[0];
                     break;
                  }
               }
            }

            assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial);
            assert(exact_mask.second & mask_type_exact);

            instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
            instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
            instr->operands[1] = Operand(exact_mask.first);
            instr->definitions[0] = dst;
            instr->definitions[1] = bld.def(s1, scc);
         }
      } else if (instr->opcode == aco_opcode::p_demote_to_helper) {
         /* turn demote into discard_if with only exact masks */
         assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global));
         ctx.info[block->index].exec[0].second &= ~mask_type_initial;

         int num;
         Temp cond, exit_cond;
         if (instr->operands[0].isConstant()) {
            assert(instr->operands[0].constantValue() == -1u);
            /* transition to exact and set exec to zero */
            Temp old_exec = ctx.info[block->index].exec.back().first;
            Temp new_exec = bld.tmp(bld.lm);
            exit_cond = bld.tmp(s1);
            cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
                            bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));

            num = ctx.info[block->index].exec.size() - 2;
            if (ctx.info[block->index].exec.back().second & mask_type_exact) {
               ctx.info[block->index].exec.back().first = new_exec;
            } else {
               ctx.info[block->index].exec.back().first = cond;
               ctx.info[block->index].exec.emplace_back(new_exec, mask_type_exact);
            }
         } else {
            /* demote_if: transition to exact */
            transition_to_Exact(ctx, bld, block->index);
            assert(instr->operands[0].isTemp());
            cond = instr->operands[0].getTemp();
            num = ctx.info[block->index].exec.size() - 1;
         }

         for (int i = num; i >= 0; i--) {
            if (ctx.info[block->index].exec[i].second & mask_type_exact) {
               Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                             ctx.info[block->index].exec[i].first, cond);
               if (i == (int)ctx.info[block->index].exec.size() - 1) {
                  andn2->operands[0].setFixed(exec);
                  andn2->definitions[0].setFixed(exec);
               }

               ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
               exit_cond = andn2->definitions[1].getTemp();
            } else {
               assert(i != 0);
            }
         }
         instr->opcode = aco_opcode::p_exit_early_if;
         instr->operands[0] = bld.scc(exit_cond);
         state = Exact;

      } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
         bool need_check = ctx.info[block->index].exec.size() != 1 &&
                           !(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact);
         lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first);
      }

      bld.insert(std::move(instr));
   }
}

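/* Rewrites the terminating branch of a block according to its kind (loop
 * preheader, discard, continue-or-break, uniform, divergent if, invert,
 * break, continue), updating the exec-mask stack along the way. */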
void add_branch_code(exec_ctx& ctx, Block* block)
{
   unsigned idx = block->index;
   Builder bld(ctx.program, block);

   if (idx == ctx.program->blocks.size() - 1)
      return;

   /* try to disable wqm handling */
   if (ctx.handle_wqm && block->kind & block_kind_top_level) {
      if (ctx.info[idx].exec.size() == 3) {
         assert(ctx.info[idx].exec[1].second == mask_type_wqm);
         ctx.info[idx].exec.pop_back();
      }
      assert(ctx.info[idx].exec.size() <= 2);

      if (ctx.info[idx].ever_again_needs == 0 ||
          ctx.info[idx].ever_again_needs == Exact) {
         /* transition to Exact */
         aco_ptr<Instruction> branch = std::move(block->instructions.back());
         block->instructions.pop_back();
         ctx.info[idx].exec.back().second |= mask_type_global;
         transition_to_Exact(ctx, bld, idx);
         bld.insert(std::move(branch));
         ctx.handle_wqm = false;

      } else if (ctx.info[idx].block_needs & Preserve_WQM) {
         /* transition to WQM and remove global flag */
         aco_ptr<Instruction> branch = std::move(block->instructions.back());
         block->instructions.pop_back();
         transition_to_WQM(ctx, bld, idx);
         ctx.info[idx].exec.back().second &= ~mask_type_global;
         bld.insert(std::move(branch));
      }
   }

   if (block->kind & block_kind_loop_preheader) {
      /* collect information about the succeeding loop */
      bool has_divergent_break = false;
      bool has_divergent_continue = false;
      bool has_discard = false;
      uint8_t needs = 0;
      unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth;

      for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) {
         Block& loop_block = ctx.program->blocks[i];
         needs |= ctx.info[i].block_needs;

         if (loop_block.kind & block_kind_uses_discard_if ||
             loop_block.kind & block_kind_discard ||
             loop_block.kind & block_kind_uses_demote)
            has_discard = true;
         if (loop_block.loop_nest_depth != loop_nest_depth)
            continue;

         if (loop_block.kind & block_kind_uniform)
            continue;
         else if (loop_block.kind & block_kind_break)
            has_divergent_break = true;
         else if (loop_block.kind & block_kind_continue)
            has_divergent_continue = true;
      }

      if (ctx.handle_wqm) {
         if (needs & WQM) {
            aco_ptr<Instruction> branch = std::move(block->instructions.back());
            block->instructions.pop_back();
            transition_to_WQM(ctx, bld, idx);
            bld.insert(std::move(branch));
         } else {
            aco_ptr<Instruction> branch = std::move(block->instructions.back());
            block->instructions.pop_back();
            transition_to_Exact(ctx, bld, idx);
            bld.insert(std::move(branch));
         }
      }

      unsigned num_exec_masks = ctx.info[idx].exec.size();
      if (block->kind & block_kind_top_level)
         num_exec_masks = std::min(num_exec_masks, 2u);

      ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]],
                            num_exec_masks,
                            needs,
                            has_divergent_break,
                            has_divergent_continue,
                            has_discard);
   }

   /* For normal breaks, this is the exec mask. For discard+break, it's the
    * old exec mask before it was zero'd.
    */
   Operand break_cond = bld.exec(ctx.info[idx].exec.back().first);

   if (block->kind & block_kind_discard) {

      assert(block->instructions.back()->format == Format::PSEUDO_BRANCH);
      aco_ptr<Instruction> branch = std::move(block->instructions.back());
      block->instructions.pop_back();

      /* create a discard_if() instruction with the exec mask as condition */
      unsigned num = 0;
      if (ctx.loop.size()) {
         /* if we're in a loop, only discard from the outer exec masks */
         num = ctx.loop.back().num_exec_masks;
      } else {
         num = ctx.info[idx].exec.size() - 1;
      }

      Temp old_exec = ctx.info[idx].exec.back().first;
      Temp new_exec = bld.tmp(bld.lm);
      Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                           bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
      ctx.info[idx].exec.back().first = new_exec;

      for (int i = num - 1; i >= 0; i--) {
         Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                       ctx.info[block->index].exec[i].first, cond);
         if (i == (int)ctx.info[idx].exec.size() - 1)
            andn2->definitions[0].setFixed(exec);
         if (i == 0)
            bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
         ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
      }
      assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);

      break_cond = Operand(cond);
      bld.insert(std::move(branch));
      /* no return here as it can be followed by a divergent break */
   }

   if (block->kind & block_kind_continue_or_break) {
      assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & block_kind_loop_header);
      assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit);
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      bool need_parallelcopy = false;
      while (!(ctx.info[idx].exec.back().second & mask_type_loop)) {
         ctx.info[idx].exec.pop_back();
         need_parallelcopy = true;
      }

      if (need_parallelcopy)
         ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first);
      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]);
      return;
   }

   if (block->kind & block_kind_uniform) {
      Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(block->instructions.back().get());
      if (branch->opcode == aco_opcode::p_branch) {
         branch->target[0] = block->linear_succs[0];
      } else {
         branch->target[0] = block->linear_succs[1];
         branch->target[1] = block->linear_succs[0];
      }
      return;
   }

   if (block->kind & block_kind_branch) {

      if (ctx.handle_wqm &&
          ctx.info[idx].exec.size() >= 2 &&
          ctx.info[idx].exec.back().second == mask_type_exact &&
          !(ctx.info[idx].block_needs & Exact_Branch) &&
          ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
         /* return to wqm before branching */
         ctx.info[idx].exec.pop_back();
      }

      // orig = s_and_saveexec_b64
      assert(block->linear_succs.size() == 2);
      assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
      Temp cond = block->instructions.back()->operands[0].getTemp();
      block->instructions.pop_back();

      if (ctx.info[idx].block_needs & Exact_Branch)
         transition_to_Exact(ctx, bld, idx);

      Temp current_exec = ctx.info[idx].exec.back().first;
      uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);

      Temp then_mask = bld.tmp(bld.lm);
      Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                               bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));

      ctx.info[idx].exec.back().first = old_exec;

      /* add next current exec to the stack */
      ctx.info[idx].exec.emplace_back(then_mask, mask_type);

      bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]);
      return;
   }

   if (block->kind & block_kind_invert) {
      // exec = s_andn2_b64 (original_exec, exec)
      assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_nz);
      block->instructions.pop_back();
      Temp then_mask = ctx.info[idx].exec.back().first;
      uint8_t mask_type = ctx.info[idx].exec.back().second;
      ctx.info[idx].exec.pop_back();
      Temp orig_exec = ctx.info[idx].exec.back().first;
      Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
                                bld.def(s1, scc), orig_exec, bld.exec(then_mask));

      /* add next current exec to the stack */
      ctx.info[idx].exec.emplace_back(else_mask, mask_type);

      bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]);
      return;
   }

   if (block->kind & block_kind_break) {
      // loop_mask = s_andn2_b64 (loop_mask, exec)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      Temp cond = Temp();
      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
         cond = bld.tmp(s1);
         Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                              exec_mask, break_cond);
         ctx.info[idx].exec[exec_idx].first = exec_mask;
         if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
            break;
      }

      /* check if the successor is the merge block, otherwise set exec to 0 */
      // TODO: this could be done better by directly branching to the merge block
      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
      Block& succ = ctx.program->blocks[succ_idx];
      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
         ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
      }

      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
      return;
   }

   if (block->kind & block_kind_continue) {
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

      Temp current_exec = ctx.info[idx].exec.back().first;
      Temp cond = Temp();
      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
         if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
            break;
         cond = bld.tmp(s1);
         Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                              exec_mask, bld.exec(current_exec));
         ctx.info[idx].exec[exec_idx].first = exec_mask;
      }
      assert(cond != Temp());

      /* check if the successor is the merge block, otherwise set exec to 0 */
      // TODO: this could be done better by directly branching to the merge block
      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
      Block& succ = ctx.program->blocks[succ_idx];
      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
         ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
      }

      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
      return;
   }
}

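/* Per-block driver: emits the coupling code, processes the instructions,
 * rewrites the branch and records the live-out exec mask. */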
void process_block(exec_ctx& ctx, Block* block)
{
   std::vector<aco_ptr<Instruction>> instructions;
   instructions.reserve(block->instructions.size());

   unsigned idx = add_coupling_code(ctx, block, instructions);

   assert(block->index != ctx.program->blocks.size() - 1 ||
          ctx.info[block->index].exec.size() <= 2);

   process_instructions(ctx, block, instructions, idx);

   block->instructions = std::move(instructions);

   add_branch_code(ctx, block);

   block->live_out_exec = ctx.info[block->index].exec.back().first;
}

} /* end namespace */


void insert_exec_mask(Program *program)
{
   exec_ctx ctx(program);

   if (program->needs_wqm && program->needs_exact)
      calculate_wqm_needs(ctx);

   for (Block& block : program->blocks)
      process_block(ctx, &block);

}

}