/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/sid.h"

#include <map>
#include <stack>
#include <vector>

namespace aco {

namespace {

/**
 * The general idea of this pass is:
 * The CFG is traversed in reverse postorder (forward) and loops are processed
 * several times until no progress is made.
 * Per BB, two wait_ctx are maintained: an in-context and an out-context.
 * The in-context is the join of the out-contexts of the predecessors.
 * The context contains a map: gpr -> wait_entry
 * consisting of the information about the cnt values to be waited for.
 * Note: After merge nodes, it might occur that multiple cnt values have to be
 *       waited for, for the same register.
 *
 * The values are updated according to the encountered instructions:
 * - additional events increment the counter of waits of the same type
 * - waits erase gprs whose counters are higher than the waited-for value.
 */

// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
// when there is a load followed by a use of a previous load

/* Instructions of the same event will finish in-order except for smem
 * and maybe flat. Instructions of different events may not finish in-order. */
enum wait_event : uint16_t {
   event_smem = 1 << 0,
   event_lds = 1 << 1,
   event_gds = 1 << 2,
   event_vmem = 1 << 3,
   event_vmem_store = 1 << 4, /* GFX10+ */
   event_flat = 1 << 5,
   event_exp_pos = 1 << 6,
   event_exp_param = 1 << 7,
   event_exp_mrt_null = 1 << 8,
   event_gds_gpr_lock = 1 << 9,
   event_vmem_gpr_lock = 1 << 10,
   event_sendmsg = 1 << 11,
   event_ldsdir = 1 << 12,
   event_valu = 1 << 13,
   event_trans = 1 << 14,
   event_salu = 1 << 15,
   num_events = 16,
};

enum counter_type : uint8_t {
   counter_exp = 1 << 0,
   counter_lgkm = 1 << 1,
   counter_vm = 1 << 2,
   counter_vs = 1 << 3,
   counter_alu = 1 << 4,
   num_counters = 5,
};

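/* Subtype of a VMEM access. VMEM results of the same subtype return in order,
 * while different subtypes may complete out of order, so a wait on an
 * overwritten register can only be skipped when the subtypes match
 * (see check_instr). */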
enum vmem_type : uint8_t {
   vmem_nosampler = 1 << 0,
   vmem_sampler = 1 << 1,
   vmem_bvh = 1 << 2,
};

static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
                                   event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;

/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
 * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
 * that the frontend should switch to a different wave; it contains information on the
 * dependencies that determine when it can switch back.
 *
 * This seems to apply only to ALU->ALU dependencies, as other instructions have better
 * integration with the frontend.
 *
 * Note that if we do not emit s_delay_alu, things will still be correct, but the wave
 * will stall in the ALU (and the ALU will be doing nothing else). We rely on this
 * because our cycle info is sometimes wrong (necessarily so, e.g. wave64 VALU
 * instructions can take a different number of cycles based on the exec mask).
 */
struct alu_delay_info {
   /* These are the values directly above the max representable value, i.e. the wait
    * would turn into a no-op when we try to wait for something further back than
    * this.
    */
   static constexpr int8_t valu_nop = 5;
   static constexpr int8_t trans_nop = 4;

   /* How many VALU instructions ago this value was written */
   int8_t valu_instrs = valu_nop;
   /* Cycles until the writing VALU instruction is finished */
   int8_t valu_cycles = 0;

   /* How many transcendental instructions ago this value was written */
   int8_t trans_instrs = trans_nop;
   /* Cycles until the writing transcendental instruction is finished */
   int8_t trans_cycles = 0;

   /* Cycles until the writing SALU instruction is finished */
   int8_t salu_cycles = 0;

   bool combine(const alu_delay_info& other)
   {
      bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
                     other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
                     other.trans_cycles > trans_cycles;
      valu_instrs = std::min(valu_instrs, other.valu_instrs);
      trans_instrs = std::min(trans_instrs, other.trans_instrs);
      salu_cycles = std::max(salu_cycles, other.salu_cycles);
      valu_cycles = std::max(valu_cycles, other.valu_cycles);
      trans_cycles = std::max(trans_cycles, other.trans_cycles);
      return changed;
   }

   /* Needs to be called after any change to keep the data consistent. */
   void fixup()
   {
      if (valu_instrs >= valu_nop || valu_cycles <= 0) {
         valu_instrs = valu_nop;
         valu_cycles = 0;
      }

      if (trans_instrs >= trans_nop || trans_cycles <= 0) {
         trans_instrs = trans_nop;
         trans_cycles = 0;
      }

      salu_cycles = std::max<int8_t>(salu_cycles, 0);
   }

   /* Returns true if a wait would be a no-op */
   bool empty() const
   {
      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
   }

   UNUSED void print(FILE* output) const
   {
      if (valu_instrs != valu_nop)
         fprintf(output, "valu_instrs: %u\n", valu_instrs);
      if (valu_cycles)
         fprintf(output, "valu_cycles: %u\n", valu_cycles);
      if (trans_instrs != trans_nop)
         fprintf(output, "trans_instrs: %u\n", trans_instrs);
      if (trans_cycles)
         fprintf(output, "trans_cycles: %u\n", trans_cycles);
      if (salu_cycles)
         fprintf(output, "salu_cycles: %u\n", salu_cycles);
   }
};

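/* Map a wait_event to the mask of counter_type counters it affects. */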
uint8_t
get_counters_for_event(wait_event ev)
{
   switch (ev) {
   case event_smem:
   case event_lds:
   case event_gds:
   case event_sendmsg: return counter_lgkm;
   case event_vmem: return counter_vm;
   case event_vmem_store: return counter_vs;
   case event_flat: return counter_vm | counter_lgkm;
   case event_exp_pos:
   case event_exp_param:
   case event_exp_mrt_null:
   case event_gds_gpr_lock:
   case event_vmem_gpr_lock:
   case event_ldsdir: return counter_exp;
   case event_valu:
   case event_trans:
   case event_salu: return counter_alu;
   default: return 0;
   }
}

struct wait_entry {
   wait_imm imm;
   alu_delay_info delay;
   uint16_t events;  /* use wait_event notion */
   uint8_t counters; /* use counter_type notion */
   bool wait_on_read : 1;
   bool logical : 1;
   uint8_t vmem_types : 4;

   wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, bool logical_,
              bool wait_on_read_)
       : imm(imm_), delay(delay_), events(event_), counters(get_counters_for_event(event_)),
         wait_on_read(wait_on_read_), logical(logical_), vmem_types(0)
   {}

   bool join(const wait_entry& other)
   {
      bool changed = (other.events & ~events) || (other.counters & ~counters) ||
                     (other.wait_on_read && !wait_on_read) || (other.vmem_types & ~vmem_types) ||
                     (!other.logical && logical);
      events |= other.events;
      counters |= other.counters;
      changed |= imm.combine(other.imm);
      changed |= delay.combine(other.delay);
      wait_on_read |= other.wait_on_read;
      vmem_types |= other.vmem_types;
      logical &= other.logical;
      return changed;
   }

   void remove_counter(counter_type counter)
   {
      counters &= ~counter;

      if (counter == counter_lgkm) {
         imm.lgkm = wait_imm::unset_counter;
         events &= ~(event_smem | event_lds | event_gds | event_sendmsg);
      }

      if (counter == counter_vm) {
         imm.vm = wait_imm::unset_counter;
         events &= ~event_vmem;
         vmem_types = 0;
      }

      if (counter == counter_exp) {
         imm.exp = wait_imm::unset_counter;
         events &= ~exp_events;
      }

      if (counter == counter_vs) {
         imm.vs = wait_imm::unset_counter;
         events &= ~event_vmem_store;
      }

      if (!(counters & counter_lgkm) && !(counters & counter_vm))
         events &= ~event_flat;

      if (counter == counter_alu) {
         delay = alu_delay_info();
         events &= ~(event_valu | event_trans | event_salu);
      }
   }

   UNUSED void print(FILE* output) const
   {
      fprintf(output, "logical: %u\n", logical);
      imm.print(output);
      delay.print(output);
      if (events)
         fprintf(output, "events: %u\n", events);
      if (counters)
         fprintf(output, "counters: %u\n", counters);
      if (!wait_on_read)
         fprintf(output, "wait_on_read: %u\n", wait_on_read);
      if (!logical)
         fprintf(output, "logical: %u\n", logical);
      if (vmem_types)
         fprintf(output, "vmem_types: %u\n", vmem_types);
   }
};

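/* Wait state at a single program point. Per BB, an in-context and an out-context
 * are kept; the in-context of a block is the join of the out-contexts of its
 * predecessors (see the comment at the top of this file). */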
struct wait_ctx {
   Program* program;
   enum amd_gfx_level gfx_level;
   uint16_t max_vm_cnt;
   uint16_t max_exp_cnt;
   uint16_t max_lgkm_cnt;
   uint16_t max_vs_cnt;
   uint16_t unordered_events = event_smem | event_flat;

   bool vm_nonzero = false;
   bool exp_nonzero = false;
   bool lgkm_nonzero = false;
   bool vs_nonzero = false;
   bool pending_flat_lgkm = false;
   bool pending_flat_vm = false;
   bool pending_s_buffer_store = false; /* GFX10 workaround */

   wait_imm barrier_imm[storage_count];
   uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */

   std::map<PhysReg, wait_entry> gpr_map;

   wait_ctx() {}
   wait_ctx(Program* program_)
       : program(program_), gfx_level(program_->gfx_level),
         max_vm_cnt(program_->gfx_level >= GFX9 ? 62 : 14), max_exp_cnt(6),
         max_lgkm_cnt(program_->gfx_level >= GFX10 ? 62 : 14),
         max_vs_cnt(program_->gfx_level >= GFX10 ? 62 : 0),
         unordered_events(event_smem | (program_->gfx_level < GFX10 ? event_flat : 0))
   {}

   bool join(const wait_ctx* other, bool logical)
   {
      bool changed = other->exp_nonzero > exp_nonzero || other->vm_nonzero > vm_nonzero ||
                     other->lgkm_nonzero > lgkm_nonzero || other->vs_nonzero > vs_nonzero ||
                     (other->pending_flat_lgkm && !pending_flat_lgkm) ||
                     (other->pending_flat_vm && !pending_flat_vm);

      exp_nonzero |= other->exp_nonzero;
      vm_nonzero |= other->vm_nonzero;
      lgkm_nonzero |= other->lgkm_nonzero;
      vs_nonzero |= other->vs_nonzero;
      pending_flat_lgkm |= other->pending_flat_lgkm;
      pending_flat_vm |= other->pending_flat_vm;
      pending_s_buffer_store |= other->pending_s_buffer_store;

      for (const auto& entry : other->gpr_map) {
         if (entry.second.logical != logical)
            continue;

         using iterator = std::map<PhysReg, wait_entry>::iterator;
         const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
         if (insert_pair.second) {
            changed = true;
         } else {
            changed |= insert_pair.first->second.join(entry.second);
         }
      }

      for (unsigned i = 0; i < storage_count; i++) {
         changed |= barrier_imm[i].combine(other->barrier_imm[i]);
         changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
         barrier_events[i] |= other->barrier_events[i];
      }

      return changed;
   }

   void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
   {
      entry.remove_counter(counter);
   }

   UNUSED void print(FILE* output) const
   {
      fprintf(output, "exp_nonzero: %u\n", exp_nonzero);
      fprintf(output, "vm_nonzero: %u\n", vm_nonzero);
      fprintf(output, "lgkm_nonzero: %u\n", lgkm_nonzero);
      fprintf(output, "vs_nonzero: %u\n", vs_nonzero);
      fprintf(output, "pending_flat_lgkm: %u\n", pending_flat_lgkm);
      fprintf(output, "pending_flat_vm: %u\n", pending_flat_vm);
      for (const auto& entry : gpr_map) {
         fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
                 entry.first.reg() & 0xff);
         entry.second.print(output);
         fprintf(output, "}\n");
      }

      for (unsigned i = 0; i < storage_count; i++) {
         if (!barrier_imm[i].empty() || barrier_events[i]) {
            fprintf(output, "barriers[%u] = {\n", i);
            barrier_imm[i].print(output);
            fprintf(output, "events: %u\n", barrier_events[i]);
            fprintf(output, "}\n");
         }
      }
   }
};

uint8_t
get_vmem_type(Instruction* instr)
{
   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
      return vmem_bvh;
   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
            instr->operands[1].regClass() == s4)
      return vmem_sampler;
   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
      return vmem_nosampler;
   return 0;
}

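/* Gather the counter values (and, for ALU instructions, the delays) that must be
 * waited on before this instruction, based on the registers it reads and
 * overwrites. */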
void
check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr)
{
   for (const Operand op : instr->operands) {
      if (op.isConstant() || op.isUndefined())
         continue;

      /* check consecutively read gprs */
      for (unsigned j = 0; j < op.size(); j++) {
         PhysReg reg{op.physReg() + j};
         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
            continue;

         wait.combine(it->second.imm);
         if (instr->isVALU() || instr->isSALU())
            delay.combine(it->second.delay);
      }
   }

   for (const Definition& def : instr->definitions) {
      /* check consecutively written gprs */
      for (unsigned j = 0; j < def.getTemp().size(); j++) {
         PhysReg reg{def.physReg() + j};

         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end())
            continue;

         /* Vector Memory reads and writes return in the order they were issued */
         uint8_t vmem_type = get_vmem_type(instr);
         if (vmem_type && ((it->second.events & vm_events) == event_vmem) &&
             it->second.vmem_types == vmem_type)
            continue;

         /* LDS reads and writes return in the order they were issued. same for GDS */
         if (instr->isDS() &&
             (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
            continue;

         wait.combine(it->second.imm);
      }
   }
}

bool
parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
{
   if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->operands[0].physReg() == sgpr_null) {
      imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
      return true;
   } else if (instr->opcode == aco_opcode::s_waitcnt) {
      imm.combine(wait_imm(ctx.gfx_level, instr->sopp().imm));
      return true;
   }
   return false;
}

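/* Fold an existing s_delay_alu into the queued delay info. Returns true if the
 * instruction was consumed; it is then re-emitted from the queued delay instead
 * of being copied verbatim. */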
bool
parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr)
{
   if (instr->opcode != aco_opcode::s_delay_alu)
      return false;

   unsigned imm[2] = {instr->sopp().imm & 0xf, (instr->sopp().imm >> 7) & 0xf};
   for (unsigned i = 0; i < 2; ++i) {
      alu_delay_wait wait = (alu_delay_wait)imm[i];
      if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4)
         delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1;
      else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3)
         delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1;
      else if (wait >= alu_delay_wait::SALU_CYCLE_1)
         delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1;
   }

   delay.valu_cycles = instr->pass_flags & 0xffff;
   delay.trans_cycles = instr->pass_flags >> 16;

   return true;
}

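/* Combine into 'imm' the waits needed so that memory operations on the storage
 * classes affected by this barrier are complete, according to the barrier's
 * synchronization scope and semantics. */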
void
perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
{
   sync_scope subgroup_scope =
      ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
   if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
      unsigned storage = sync.storage;
      while (storage) {
         unsigned idx = u_bit_scan(&storage);

         /* LDS is private to the workgroup */
         sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);

         uint16_t events = ctx.barrier_events[idx];
         if (bar_scope_lds <= subgroup_scope)
            events &= ~event_lds;

         /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
          * in-order for the same workgroup */
         if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
            events &= ~(event_vmem | event_vmem_store | event_smem);

         if (events)
            imm.combine(ctx.barrier_imm[idx]);
      }
   }
}

void
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
{
   if (ctx.vm_nonzero)
      imm.vm = 0;
   if (ctx.exp_nonzero)
      imm.exp = 0;
   if (ctx.lgkm_nonzero)
      imm.lgkm = 0;

   if (ctx.gfx_level >= GFX10) {
      if (ctx.vs_nonzero)
         imm.vs = 0;
   }
}

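/* Age the tracked ALU dependencies by one instruction and by 'cycles' issue cycles,
 * or drop them entirely if 'clear' is set. Entries whose delay becomes a no-op are
 * erased from the gpr map. */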
void
update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
{
   std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
   while (it != ctx.gpr_map.end()) {
      wait_entry& entry = it->second;

      if (clear) {
         entry.remove_counter(counter_alu);
      } else {
         entry.delay.valu_instrs += is_valu ? 1 : 0;
         entry.delay.trans_instrs += is_trans ? 1 : 0;
         entry.delay.salu_cycles -= cycles;
         entry.delay.valu_cycles -= cycles;
         entry.delay.trans_cycles -= cycles;

         entry.delay.fixup();
         if (it->second.delay.empty())
            entry.remove_counter(counter_alu);
      }

      if (!entry.counters)
         it = ctx.gpr_map.erase(it);
      else
         it++;
   }
}

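/* Determine the waits and ALU delays that must be emitted before 'instr', then
 * update the context: counters that are waited on are reset and satisfied entries
 * are removed from the gpr map. */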
void
kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
     memory_sync_info sync_info)
{
   if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
      /* Force emitting waitcnt states right after the instruction if there is
       * something to wait for. This is also applied for s_setpc_b64 to ensure
       * waitcnt states are inserted before jumping to the PS epilog.
       */
      force_waitcnt(ctx, imm);
   }

   /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
    * overlapping waves proceed into the ordered section.
    */
   if (ctx.program->has_pops_overlapped_waves_wait &&
       (ctx.gfx_level >= GFX11 ? instr->isEXP() && instr->exp().done
                               : (instr->opcode == aco_opcode::s_sendmsg &&
                                  instr->sopp().imm == sendmsg_ordered_ps_done))) {
      if (ctx.vm_nonzero)
         imm.vm = 0;
      if (ctx.gfx_level >= GFX10 && ctx.vs_nonzero)
         imm.vs = 0;
      /* Await SMEM loads too, as it's possible for an application to create them, e.g. by using
       * a scalarization loop - pointless and suboptimal for an inherently divergent address of
       * per-pixel data, but it can still be done at least synthetically and must be handled
       * correctly.
       */
      if (ctx.program->has_smem_buffer_or_global_loads && ctx.lgkm_nonzero)
         imm.lgkm = 0;
   }

   check_instr(ctx, imm, delay, instr);

   /* It's required to wait for scalar stores before "writing back" data.
    * It shouldn't cost anything anyways since we're about to do s_endpgm.
    */
   if (ctx.lgkm_nonzero && instr->opcode == aco_opcode::s_dcache_wb) {
      assert(ctx.gfx_level >= GFX8);
      imm.lgkm = 0;
   }

   if (ctx.gfx_level >= GFX10 && instr->isSMEM()) {
      /* GFX10: A store followed by a load at the same address causes a problem because
       * the load doesn't load the correct values unless we wait for the store first.
       * This is NOT mitigated by an s_nop.
       *
       * TODO: Refine this when we have proper alias analysis.
       */
      if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
          !instr->smem().sync.can_reorder()) {
         imm.lgkm = 0;
      }
   }

   if (instr->opcode == aco_opcode::ds_ordered_count &&
       ((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
      imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
   }

   if (instr->opcode == aco_opcode::p_barrier)
      perform_barrier(ctx, imm, instr->barrier().sync, semantic_acqrel);
   else
      perform_barrier(ctx, imm, sync_info, semantic_release);

   if (!imm.empty() || !delay.empty()) {
      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
         imm.vm = 0;
      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
         imm.lgkm = 0;

      /* reset counters */
      ctx.exp_nonzero &= imm.exp != 0;
      ctx.vm_nonzero &= imm.vm != 0;
      ctx.lgkm_nonzero &= imm.lgkm != 0;
      ctx.vs_nonzero &= imm.vs != 0;

      /* update barrier wait imms */
      for (unsigned i = 0; i < storage_count; i++) {
         wait_imm& bar = ctx.barrier_imm[i];
         uint16_t& bar_ev = ctx.barrier_events[i];
         if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
            bar.exp = wait_imm::unset_counter;
            bar_ev &= ~exp_events;
         }
         if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) {
            bar.vm = wait_imm::unset_counter;
            bar_ev &= ~(vm_events & ~event_flat);
         }
         if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) {
            bar.lgkm = wait_imm::unset_counter;
            bar_ev &= ~(lgkm_events & ~event_flat);
         }
         if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) {
            bar.vs = wait_imm::unset_counter;
            bar_ev &= ~vs_events;
         }
         if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
            bar_ev &= ~event_flat;
      }

      if (ctx.program->gfx_level >= GFX11) {
         update_alu(ctx, false, false, false,
                    MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
      }

      /* remove all gprs with higher counter from map */
      std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
         if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
         if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
         if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
         if (delay.valu_instrs <= it->second.delay.valu_instrs)
            it->second.delay.valu_instrs = alu_delay_info::valu_nop;
         if (delay.trans_instrs <= it->second.delay.trans_instrs)
            it->second.delay.trans_instrs = alu_delay_info::trans_nop;
         it->second.delay.fixup();
         if (it->second.delay.empty())
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_alu);
         if (!it->second.counters)
            it = ctx.gpr_map.erase(it);
         else
            it++;
      }
   }

   if (imm.vm == 0)
      ctx.pending_flat_vm = false;
   if (imm.lgkm == 0) {
      ctx.pending_flat_lgkm = false;
      ctx.pending_s_buffer_store = false;
   }
}

void
update_barrier_counter(uint8_t* ctr, unsigned max)
{
   if (*ctr != wait_imm::unset_counter && *ctr < max)
      (*ctr)++;
}

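/* Update the per-storage-class barrier counters: events touching a storage class
 * reset its counters to zero, and subsequent in-order events of the same kind
 * increment them, yielding the value a later barrier would need to wait for. */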
void
update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
   for (unsigned i = 0; i < storage_count; i++) {
      wait_imm& bar = ctx.barrier_imm[i];
      uint16_t& bar_ev = ctx.barrier_events[i];
      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
         bar_ev |= event;
         if (counters & counter_lgkm)
            bar.lgkm = 0;
         if (counters & counter_vm)
            bar.vm = 0;
         if (counters & counter_exp)
            bar.exp = 0;
         if (counters & counter_vs)
            bar.vs = 0;
      } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) {
         if (counters & counter_lgkm && (bar_ev & lgkm_events) == event)
            update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt);
         if (counters & counter_vm && (bar_ev & vm_events) == event)
            update_barrier_counter(&bar.vm, ctx.max_vm_cnt);
         if (counters & counter_exp && (bar_ev & exp_events) == event)
            update_barrier_counter(&bar.exp, ctx.max_exp_cnt);
         if (counters & counter_vs && (bar_ev & vs_events) == event)
            update_barrier_counter(&bar.vs, ctx.max_vs_cnt);
      }
   }
}

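/* Record a newly issued event: mark its counters as outstanding and increment the
 * wait values of existing gpr and barrier entries that now have one more in-order
 * event ahead of them. */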
void
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
{
   uint8_t counters = get_counters_for_event(event);

   if (counters & counter_lgkm)
      ctx.lgkm_nonzero = true;
   if (counters & counter_vm)
      ctx.vm_nonzero = true;
   if (counters & counter_exp)
      ctx.exp_nonzero = true;
   if (counters & counter_vs)
      ctx.vs_nonzero = true;

   update_barrier_imm(ctx, counters, event, sync);

   if (ctx.unordered_events & event)
      return;

   if (ctx.pending_flat_lgkm)
      counters &= ~counter_lgkm;
   if (ctx.pending_flat_vm)
      counters &= ~counter_vm;

   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
      wait_entry& entry = e.second;

      if (entry.events & ctx.unordered_events)
         continue;

      assert(entry.events);

      if ((counters & counter_exp) && (entry.events & exp_events) == event &&
          entry.imm.exp < ctx.max_exp_cnt)
         entry.imm.exp++;
      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
          entry.imm.lgkm < ctx.max_lgkm_cnt)
         entry.imm.lgkm++;
      if ((counters & counter_vm) && (entry.events & vm_events) == event &&
          entry.imm.vm < ctx.max_vm_cnt)
         entry.imm.vm++;
      if ((counters & counter_vs) && (entry.events & vs_events) == event &&
          entry.imm.vs < ctx.max_vs_cnt)
         entry.imm.vs++;
   }
}

void
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
{
   assert(ctx.gfx_level < GFX10);

   ctx.lgkm_nonzero = true;
   ctx.vm_nonzero = true;

   update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);

   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
      if (e.second.counters & counter_vm)
         e.second.imm.vm = 0;
      if (e.second.counters & counter_lgkm)
         e.second.imm.lgkm = 0;
   }
   ctx.pending_flat_lgkm = true;
   ctx.pending_flat_vm = true;
}

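/* Add (or merge) a wait entry for every register touched by a new event, seeded
 * with counter value 0 and, for ALU events, the given latency in cycles. */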
void
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
                  uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false)
{
   uint16_t counters = get_counters_for_event(event);
   wait_imm imm;
   if (counters & counter_lgkm)
      imm.lgkm = 0;
   if (counters & counter_vm)
      imm.vm = 0;
   if (counters & counter_exp)
      imm.exp = 0;
   if (counters & counter_vs)
      imm.vs = 0;

   alu_delay_info delay;
   if (event == event_valu) {
      delay.valu_instrs = 0;
      delay.valu_cycles = cycles;
   } else if (event == event_trans) {
      delay.trans_instrs = 0;
      delay.trans_cycles = cycles;
   } else if (event == event_salu) {
      delay.salu_cycles = cycles;
   }

   wait_entry new_entry(event, imm, delay, !rc.is_linear() && !force_linear, wait_on_read);
   new_entry.vmem_types |= vmem_types;

   for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
      if (!it.second)
         it.first->second.join(new_entry);
   }
}

void
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0)
{
   if (!op.isConstant() && !op.isUndefined())
      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, 0);
}

void
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
                  unsigned cycles = 0)
{
   /* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without
    * waiting for the load to finish.
    * Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane
    * dependency checks.
    */
   uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat;
   uint32_t alu_events = event_trans | event_valu | event_salu;
   bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events));

   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles,
                     force_linear);
}

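/* GFX11+: track ALU dependencies for this instruction's definitions and age the
 * existing ones (see update_alu). */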
void
gen_alu(Instruction* instr, wait_ctx& ctx)
{
   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
   bool is_valu = instr->isVALU();
   bool is_trans = instr->isTrans();
   bool clear = instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() ||
                instr->isMUBUF() || instr->isMTBUF();

   wait_event event = (wait_event)0;
   if (is_trans)
      event = event_trans;
   else if (is_valu)
      event = event_valu;
   else if (instr->isSALU())
      event = event_salu;

   if (event != (wait_event)0) {
      for (const Definition& def : instr->definitions)
         insert_wait_entry(ctx, def, event, 0, cycle_info.latency);
   }
   update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans,
              clear, cycle_info.issue_cycles);
}

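/* Record the counter events generated by this instruction and create wait entries
 * for the registers whose values it produces or reads asynchronously. */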
void
gen(Instruction* instr, wait_ctx& ctx)
{
   switch (instr->format) {
   case Format::EXP: {
      Export_instruction& exp_instr = instr->exp();

      wait_event ev;
      if (exp_instr.dest <= 9)
         ev = event_exp_mrt_null;
      else if (exp_instr.dest <= 15)
         ev = event_exp_pos;
      else
         ev = event_exp_param;
      update_counters(ctx, ev);

      /* insert new entries for exported vgprs */
      for (unsigned i = 0; i < 4; i++) {
         if (exp_instr.enabled_mask & (1 << i)) {
            unsigned idx = exp_instr.compressed ? i >> 1 : i;
            assert(idx < exp_instr.operands.size());
            insert_wait_entry(ctx, exp_instr.operands[idx], ev);
         }
      }
      insert_wait_entry(ctx, exec, s2, ev, false);
      break;
   }
   case Format::FLAT: {
      FLAT_instruction& flat = instr->flat();
      if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
         update_counters_for_flat_load(ctx, flat.sync);
      else
         update_counters(ctx, event_flat, flat.sync);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_flat);
      break;
   }
   case Format::SMEM: {
      SMEM_instruction& smem = instr->smem();
      update_counters(ctx, event_smem, smem.sync);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_smem);
      else if (ctx.gfx_level >= GFX10 && !smem.sync.can_reorder())
         ctx.pending_s_buffer_store = true;

      break;
   }
   case Format::DS: {
      DS_instruction& ds = instr->ds();
      update_counters(ctx, ds.gds ? event_gds : event_lds, ds.sync);
      if (ds.gds)
         update_counters(ctx, event_gds_gpr_lock);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ds.gds ? event_gds : event_lds);

      if (ds.gds) {
         for (const Operand& op : instr->operands)
            insert_wait_entry(ctx, op, event_gds_gpr_lock);
         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
      }
      break;
   }
   case Format::LDSDIR: {
      LDSDIR_instruction& ldsdir = instr->ldsdir();
      update_counters(ctx, event_ldsdir, ldsdir.sync);
      insert_wait_entry(ctx, instr->definitions[0], event_ldsdir);
      break;
   }
   case Format::MUBUF:
   case Format::MTBUF:
   case Format::MIMG:
   case Format::GLOBAL:
   case Format::SCRATCH: {
      wait_event ev =
         !instr->definitions.empty() || ctx.gfx_level < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_sync_info(instr));

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ev, get_vmem_type(instr));

      if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
      } else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
      }

      break;
   }
   case Format::SOPP: {
      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
         update_counters(ctx, event_sendmsg);
      break;
   }
   case Format::SOP1: {
      if (instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
          instr->opcode == aco_opcode::s_sendmsg_rtn_b64) {
         update_counters(ctx, event_sendmsg);
         insert_wait_entry(ctx, instr->definitions[0], event_sendmsg);
      }
      break;
   }
   default: break;
   }
}

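/* Emit the queued wait as s_waitcnt_vscnt (for vscnt) and/or s_waitcnt, then clear it. */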
void
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
{
   if (imm.vs != wait_imm::unset_counter) {
      assert(ctx.gfx_level >= GFX10);
      SOPK_instruction* waitcnt_vs =
         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 1, 0);
      waitcnt_vs->operands[0] = Operand(sgpr_null, s1);
      waitcnt_vs->imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
   }
   if (!imm.empty()) {
      SOPP_instruction* waitcnt =
         create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
      waitcnt->imm = imm.pack(ctx.gfx_level);
      waitcnt->block = -1;
      instructions.emplace_back(waitcnt);
   }
   imm = wait_imm();
}

void
emit_delay_alu(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
               alu_delay_info& delay)
{
   uint32_t imm = 0;
   if (delay.trans_instrs != delay.trans_nop) {
      imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
   }

   if (delay.valu_instrs != delay.valu_nop) {
      imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
   }

   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
    * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
    * getting this wrong isn't an issue. */
   if (delay.salu_cycles && imm <= 0xf) {
      unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
   }

   SOPP_instruction* inst =
      create_instruction<SOPP_instruction>(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
   inst->imm = imm;
   inst->block = -1;
   inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
   instructions.emplace_back(inst);
   delay = alu_delay_info();
}

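/* Rewrite a single block: queue up the waits/delays each instruction requires and
 * emit them immediately before that instruction in the new instruction list. */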
void
handle_block(Program* program, Block& block, wait_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> new_instructions;

   wait_imm queued_imm;
   alu_delay_info queued_delay;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
      bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());

      memory_sync_info sync_info = get_sync_info(instr.get());
      kill(queued_imm, queued_delay, instr.get(), ctx, sync_info);

      if (program->gfx_level >= GFX11)
         gen_alu(instr.get(), ctx);
      gen(instr.get(), ctx);

      if (instr->format != Format::PSEUDO_BARRIER && !is_wait && !is_delay_alu) {
         if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
            instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp);
            queued_imm.exp = wait_imm::unset_counter;
         }

         if (!queued_imm.empty())
            emit_waitcnt(ctx, new_instructions, queued_imm);
         if (!queued_delay.empty())
            emit_delay_alu(ctx, new_instructions, queued_delay);

         bool is_ordered_count_acquire =
            instr->opcode == aco_opcode::ds_ordered_count &&
            !((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1);

         new_instructions.emplace_back(std::move(instr));
         perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);

         if (is_ordered_count_acquire)
            queued_imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
      }
   }

   /* For the last block of a program that is succeeded by another shader part, wait
    * for all memory operations to finish before continuing to the next shader part.
    */
   if (block.kind & block_kind_end_with_regs)
      force_waitcnt(ctx, queued_imm);

   if (!queued_imm.empty())
      emit_waitcnt(ctx, new_instructions, queued_imm);
   if (!queued_delay.empty())
      emit_delay_alu(ctx, new_instructions, queued_delay);

   block.instructions.swap(new_instructions);
}

} /* end namespace */

void
insert_wait_states(Program* program)
{
   /* per BB ctx */
   std::vector<bool> done(program->blocks.size());
   std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program));
   std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program));

   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
   unsigned loop_progress = 0;

   if (program->pending_lds_access) {
      update_barrier_imm(in_ctx[0], get_counters_for_event(event_lds), event_lds,
                         memory_sync_info(storage_shared));
   }

   for (Definition def : program->args_pending_vmem) {
      update_counters(in_ctx[0], event_vmem);
      insert_wait_entry(in_ctx[0], def, event_vmem);
   }

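   /* Traverse the blocks in order; loops are processed several times until no
    * further progress is made (see the comment at the top of this file). */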
   for (unsigned i = 0; i < program->blocks.size();) {
      Block& current = program->blocks[i++];

      if (current.kind & block_kind_discard_early_exit) {
         /* Because the jump to the discard early exit block may happen anywhere in a block, it's
          * not possible to join it with its predecessors this way.
          * We emit all required waits when emitting the discard block.
          */
         continue;
      }

      wait_ctx ctx = in_ctx[current.index];

      if (current.kind & block_kind_loop_header) {
         loop_header_indices.push(current.index);
      } else if (current.kind & block_kind_loop_exit) {
         bool repeat = false;
         if (loop_progress == loop_header_indices.size()) {
            i = loop_header_indices.top();
            repeat = true;
         }
         loop_header_indices.pop();
         loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
         if (repeat)
            continue;
      }

      bool changed = false;
      for (unsigned b : current.linear_preds)
         changed |= ctx.join(&out_ctx[b], false);
      for (unsigned b : current.logical_preds)
         changed |= ctx.join(&out_ctx[b], true);

      if (done[current.index] && !changed) {
         in_ctx[current.index] = std::move(ctx);
         continue;
      } else {
         in_ctx[current.index] = ctx;
      }

      loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
      done[current.index] = true;

      handle_block(program, current, ctx);

      out_ctx[current.index] = std::move(ctx);
   }

   /* Combine s_delay_alu using the skip field. */
   if (program->gfx_level >= GFX11) {
      for (Block& block : program->blocks) {
         int i = 0;
         int prev_delay_alu = -1;
         for (aco_ptr<Instruction>& instr : block.instructions) {
            if (instr->opcode != aco_opcode::s_delay_alu) {
               block.instructions[i++] = std::move(instr);
               continue;
            }

            uint16_t imm = instr->sopp().imm;
            int skip = i - prev_delay_alu - 1;
            if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
               if (imm >> 7 == 0)
                  prev_delay_alu = i;
               block.instructions[i++] = std::move(instr);
               continue;
            }

            block.instructions[prev_delay_alu]->sopp().imm |= (skip << 4) | (imm << 7);
            prev_delay_alu = -1;
         }
         block.instructions.resize(i);
      }
   }
}

} // namespace aco