/*
 * Copyright 2023 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/macros.h"

#include <limits>

/*
 * This pass implements a simple forward list-scheduler which works on a small
 * partial DAG of 16 nodes at any time. Only ALU instructions are scheduled
 * entirely freely. Memory load instructions must be kept in-order and any other
 * instruction must not be re-scheduled at all.
 *
 * The main goal of this scheduler is to create more memory clauses, schedule
 * memory loads early, and to improve ALU instruction level parallelism.
 */
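/* Overview (summary of the entry points below): schedule_ilp() runs the forward
 * list-scheduler on every block, while schedule_vopd() reuses the same DAG machinery
 * on GFX11+ wave32 programs to pair VOP1/VOP2 instructions into VOPD. */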

namespace aco {
namespace {

constexpr unsigned num_nodes = 16;
using mask_t = uint16_t;
static_assert(std::numeric_limits<mask_t>::digits >= num_nodes);

struct VOPDInfo {
   VOPDInfo() : is_opy_only(0), is_dst_odd(0), src_banks(0), has_literal(0), is_commutative(0) {}
   uint16_t is_opy_only : 1;
   uint16_t is_dst_odd : 1;
   uint16_t src_banks : 10; /* 0-3: src0, 4-7: src1, 8-9: src2 */
   uint16_t has_literal : 1;
   uint16_t is_commutative : 1;
   aco_opcode op = aco_opcode::num_opcodes;
   uint32_t literal = 0;
};

struct InstrInfo {
   Instruction* instr;
   int16_t wait_cycles;          /* estimated remaining cycles until instruction can be issued. */
   mask_t dependency_mask;       /* bitmask of nodes which have to be scheduled before this node. */
   mask_t write_for_read_mask;   /* bitmask of nodes in the DAG that have a RaW dependency. */
   uint8_t next_non_reorderable; /* index of next non-reorderable instruction node after this one. */
};

struct RegisterInfo {
   mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
   uint16_t latency : 11; /* estimated outstanding latency of last register write outside the DAG. */
   uint16_t direct_dependency : 4;     /* node that has to be scheduled before any other access. */
   uint16_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
};

struct SchedILPContext {
   Program* program;
   bool is_vopd = false;
   InstrInfo nodes[num_nodes];
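   /* Per-register scheduling state, indexed by physical register
    * (SGPRs and special registers below 256, VGPRs from 256 upward). */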
   RegisterInfo regs[512];
   BITSET_DECLARE(reg_has_latency, 512) = { 0 };
   mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
   mask_t active_mask = 0;      /* bitmask of valid instruction nodes. */
   uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
   uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which should not be reordered. */
   bool potential_partial_clause; /* indicates that last_non_reorderable is the last instruction in
                                     the DAG, meaning the clause might continue outside of it. */

   /* VOPD scheduler: */
   VOPDInfo vopd[num_nodes];
   VOPDInfo prev_vopd_info;
   InstrInfo prev_info;

   mask_t vopd_odd_mask = 0;
   mask_t vopd_even_mask = 0;
};

/**
 * Returns true for side-effect free SALU and VALU instructions.
 */
bool
can_reorder(const Instruction* const instr)
{
   if (instr->isVALU() || instr->isVINTRP())
      return true;
   if (!instr->isSALU() || instr->isSOPP())
      return false;

   switch (instr->opcode) {
   /* SOP2 */
   case aco_opcode::s_cbranch_g_fork:
   case aco_opcode::s_rfe_restore_b64:
   /* SOP1 */
   case aco_opcode::s_setpc_b64:
   case aco_opcode::s_swappc_b64:
   case aco_opcode::s_rfe_b64:
   case aco_opcode::s_cbranch_join:
   case aco_opcode::s_set_gpr_idx_idx:
   case aco_opcode::s_sendmsg_rtn_b32:
   case aco_opcode::s_sendmsg_rtn_b64:
   case aco_opcode::s_barrier_signal:
   case aco_opcode::s_barrier_signal_isfirst:
   case aco_opcode::s_get_barrier_state:
   case aco_opcode::s_barrier_init:
   case aco_opcode::s_barrier_join:
   case aco_opcode::s_wakeup_barrier:
   /* SOPK */
   case aco_opcode::s_cbranch_i_fork:
   case aco_opcode::s_getreg_b32:
   case aco_opcode::s_setreg_b32:
   case aco_opcode::s_setreg_imm32_b32:
   case aco_opcode::s_call_b64:
   case aco_opcode::s_waitcnt_vscnt:
   case aco_opcode::s_waitcnt_vmcnt:
   case aco_opcode::s_waitcnt_expcnt:
   case aco_opcode::s_waitcnt_lgkmcnt:
   case aco_opcode::s_subvector_loop_begin:
   case aco_opcode::s_subvector_loop_end:
   /* SOPC */
   case aco_opcode::s_setvskip:
   case aco_opcode::s_set_gpr_idx_on: return false;
   default: break;
   }

   return true;
}

VOPDInfo
get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
{
   if (instr->format != Format::VOP1 && instr->format != Format::VOP2)
      return VOPDInfo();

   VOPDInfo info;
   info.is_commutative = true;
   switch (instr->opcode) {
   case aco_opcode::v_fmac_f32: info.op = aco_opcode::v_dual_fmac_f32; break;
   case aco_opcode::v_fmaak_f32: info.op = aco_opcode::v_dual_fmaak_f32; break;
   case aco_opcode::v_fmamk_f32:
      info.op = aco_opcode::v_dual_fmamk_f32;
      info.is_commutative = false;
      break;
   case aco_opcode::v_mul_f32: info.op = aco_opcode::v_dual_mul_f32; break;
   case aco_opcode::v_add_f32: info.op = aco_opcode::v_dual_add_f32; break;
   case aco_opcode::v_sub_f32: info.op = aco_opcode::v_dual_sub_f32; break;
   case aco_opcode::v_subrev_f32: info.op = aco_opcode::v_dual_subrev_f32; break;
   case aco_opcode::v_mul_legacy_f32: info.op = aco_opcode::v_dual_mul_dx9_zero_f32; break;
   case aco_opcode::v_mov_b32: info.op = aco_opcode::v_dual_mov_b32; break;
   case aco_opcode::v_bfrev_b32:
      if (!instr->operands[0].isConstant())
         return VOPDInfo();
      info.op = aco_opcode::v_dual_mov_b32;
      break;
   case aco_opcode::v_cndmask_b32:
      info.op = aco_opcode::v_dual_cndmask_b32;
      info.is_commutative = false;
      break;
   case aco_opcode::v_max_f32: info.op = aco_opcode::v_dual_max_f32; break;
   case aco_opcode::v_min_f32: info.op = aco_opcode::v_dual_min_f32; break;
   case aco_opcode::v_dot2c_f32_f16: info.op = aco_opcode::v_dual_dot2acc_f32_f16; break;
   case aco_opcode::v_add_u32:
      info.op = aco_opcode::v_dual_add_nc_u32;
      info.is_opy_only = true;
      break;
   case aco_opcode::v_lshlrev_b32:
      info.op = aco_opcode::v_dual_lshlrev_b32;
      info.is_opy_only = true;
      info.is_commutative = false;
      break;
   case aco_opcode::v_and_b32:
      info.op = aco_opcode::v_dual_and_b32;
      info.is_opy_only = true;
      break;
   default: return VOPDInfo();
   }

   /* Each instruction may use at most one SGPR. */
   if (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[0].isOfType(RegType::sgpr))
      return VOPDInfo();

   info.is_dst_odd = instr->definitions[0].physReg().reg() & 0x1;

   static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};
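   /* Bank encoding sketch: src0/src1 each occupy one of 4 VGPR banks (vgpr index & 3),
    * src2 one of 2 banks (vgpr index & 1). E.g. v3 as src0 sets bit 3 and v7 as src1
    * sets bit 4 + 3 = 7, giving src_banks = 0x88. */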
   bool has_sgpr = false;
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Operand op = instr->operands[i];
      if (instr->opcode == aco_opcode::v_bfrev_b32)
         op = Operand::get_const(ctx.program->gfx_level, util_bitreverse(op.constantValue()), 4);

      unsigned port = (instr->opcode == aco_opcode::v_fmamk_f32 && i == 1) ? 2 : i;
      if (op.isOfType(RegType::vgpr))
         info.src_banks |= 1 << (port * 4 + (op.physReg().reg() & bank_mask[port]));

      /* Check all operands because of fmaak/fmamk. */
      if (op.isLiteral()) {
         assert(!info.has_literal || info.literal == op.constantValue());
         info.has_literal = true;
         info.literal = op.constantValue();
      }

      /* Check all operands because of cndmask. */
      has_sgpr |= !op.isConstant() && op.isOfType(RegType::sgpr);
   }

   /* An instruction can't use both a literal and an SGPR. */
   if (has_sgpr && info.has_literal)
      return VOPDInfo();

   info.is_commutative &= instr->operands[0].isOfType(RegType::vgpr);

   return info;
}

bool
is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
{
   if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd))
      return false;

   /* Both can use a literal, but it must be the same literal. */
   if (a.has_literal && b.has_literal && a.literal != b.literal)
      return false;

   /* The rest is checking src VGPR bank compatibility. */
   if ((a.src_banks & b.src_banks) == 0)
      return true;

   if (!a.is_commutative && !b.is_commutative)
      return false;

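   /* At least one of the two is commutative: check whether swapping a's src0/src1 banks
    * (i.e. commuting its operands) removes the bank conflict. */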
   uint16_t src0 = a.src_banks & 0xf;
   uint16_t src1 = a.src_banks & 0xf0;
   uint16_t src2 = a.src_banks & 0x300;
   uint16_t a_src_banks = (src0 << 4) | (src1 >> 4) | src2;
   if ((a_src_banks & b.src_banks) != 0)
      return false;

   /* If we have to turn v_mov_b32 into v_add_u32 but there is already an OPY-only instruction,
    * we can't do it.
    */
   if (a.op == aco_opcode::v_dual_mov_b32 && !b.is_commutative && b.is_opy_only)
      return false;
   if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only)
      return false;

   return true;
}

bool
can_use_vopd(const SchedILPContext& ctx, unsigned idx)
{
   VOPDInfo cur_vopd = ctx.vopd[idx];
   Instruction* first = ctx.nodes[idx].instr;
   Instruction* second = ctx.prev_info.instr;

   if (!second)
      return false;

   if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
      return false;

   if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd))
      return false;

   assert(first->definitions.size() == 1);
   assert(first->definitions[0].size() == 1);
   assert(second->definitions.size() == 1);
   assert(second->definitions[0].size() == 1);

   /* Check for WaW dependency. */
   if (first->definitions[0].physReg() == second->definitions[0].physReg())
      return false;

   /* Check for RaW dependency. */
   for (Operand op : second->operands) {
      assert(op.size() == 1);
      if (first->definitions[0].physReg() == op.physReg())
         return false;
   }

   /* WaR dependencies are not a concern. */
   return true;
}

Instruction_cycle_info
get_cycle_info_with_mem_latency(const SchedILPContext& ctx, const Instruction* const instr)
{
   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);

   /* Based on get_wait_counter_info in aco_statistics.cpp. */
   if (instr->isVMEM() || instr->isFlatLike()) {
      cycle_info.latency = 320;
   } else if (instr->isSMEM()) {
      if (instr->operands.empty()) {
         cycle_info.latency = 1;
      } else if (instr->operands[0].size() == 2 ||
                 (instr->operands[1].isConstant() &&
                  (instr->operands.size() < 3 || instr->operands[2].isConstant()))) {
         /* Likely cached. */
         cycle_info.latency = 30;
      } else {
         cycle_info.latency = 200;
      }
   } else if (instr->isLDSDIR()) {
      cycle_info.latency = 13;
   } else if (instr->isDS()) {
      cycle_info.latency = 20;
   }

   return cycle_info;
}

bool
is_memory_instr(const Instruction* const instr)
{
   /* For memory instructions, we allow reordering them with ALU if it helps
    * to form larger clauses or to increase def-use distances.
    */
   return instr->isVMEM() || instr->isFlatLike() || instr->isSMEM() || instr->accessesLDS() ||
          instr->isEXP();
}

constexpr unsigned max_sgpr = 128;
constexpr unsigned min_vgpr = 256;
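/* Operand registers in [max_sgpr, min_vgpr) other than scc are not tracked as dependencies
 * (see add_entry/remove_entry below). */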

void
add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
{
   InstrInfo& entry = ctx.nodes[idx];
   entry.instr = instr;
   entry.wait_cycles = 0;
   entry.write_for_read_mask = 0;
   const mask_t mask = BITFIELD_BIT(idx);
   bool reorder = can_reorder(instr);
   ctx.active_mask |= mask;

   if (ctx.is_vopd) {
      VOPDInfo vopd = get_vopd_info(ctx, entry.instr);

      ctx.vopd[idx] = vopd;
      ctx.vopd_odd_mask &= ~mask;
      ctx.vopd_odd_mask |= vopd.is_dst_odd ? mask : 0;
      ctx.vopd_even_mask &= ~mask;
      ctx.vopd_even_mask |= vopd.is_dst_odd || vopd.op == aco_opcode::num_opcodes ? 0 : mask;
   }

   for (const Operand& op : instr->operands) {
      assert(op.isFixed());
      unsigned reg = op.physReg();
      if (reg >= max_sgpr && reg != scc && reg < min_vgpr) {
         reorder &= reg != pops_exiting_wave_id;
         continue;
      }

      for (unsigned i = 0; i < op.size(); i++) {
         RegisterInfo& reg_info = ctx.regs[reg + i];

         /* Add register reads. */
         reg_info.read_mask |= mask;

         if (reg_info.has_direct_dependency) {
            /* A previous dependency is still part of the DAG. */
            ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
            entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
         } else if (BITSET_TEST(ctx.reg_has_latency, reg + i)) {
            entry.wait_cycles = MAX2(entry.wait_cycles, reg_info.latency);
         }
      }
   }

   /* Check if this instruction reads implicit registers. */
   if (needs_exec_mask(instr)) {
      for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
         if (ctx.regs[reg].has_direct_dependency) {
            entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
            ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
         }
         ctx.regs[reg].read_mask |= mask;
      }
   }
   if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
      for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
         if (ctx.regs[reg].has_direct_dependency) {
            entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
            ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
         }
         ctx.regs[reg].read_mask |= mask;
      }
   }

   mask_t write_dep_mask = 0;
   for (const Definition& def : instr->definitions) {
      for (unsigned i = 0; i < def.size(); i++) {
         RegisterInfo& reg_info = ctx.regs[def.physReg().reg() + i];

         /* Add all previous register reads and writes to the dependencies. */
         write_dep_mask |= reg_info.read_mask;
         reg_info.read_mask = mask;

         /* This register write is a direct dependency for all following reads. */
         reg_info.has_direct_dependency = 1;
         reg_info.direct_dependency = idx;
      }
   }

   if (!reorder) {
      ctx.non_reorder_mask |= mask;

      /* Set this node as last non-reorderable instruction */
      if (ctx.next_non_reorderable == UINT8_MAX) {
         ctx.next_non_reorderable = idx;
      } else {
         ctx.nodes[ctx.last_non_reorderable].next_non_reorderable = idx;
      }
      ctx.last_non_reorderable = idx;
      entry.next_non_reorderable = UINT8_MAX;

      /* Just don't reorder these at all. */
      if (!is_memory_instr(instr) || instr->definitions.empty() ||
          get_sync_info(instr).semantics & semantic_volatile || ctx.is_vopd) {
         /* Add all previous instructions as dependencies. */
         entry.dependency_mask = ctx.active_mask & ~ctx.non_reorder_mask;
      }

      /* Remove non-reorderable instructions from dependencies, since WaR dependencies can interfere
       * with clause formation. This should be fine, since these are always scheduled in-order and
       * any cases that are actually a concern for clause formation are added as transitive
       * dependencies. */
      write_dep_mask &= ~ctx.non_reorder_mask;
      ctx.potential_partial_clause = true;
   } else if (ctx.last_non_reorderable != UINT8_MAX) {
      ctx.potential_partial_clause = false;
   }

   entry.dependency_mask |= write_dep_mask;
   entry.dependency_mask &= ~mask;

   for (unsigned i = 0; i < num_nodes; i++) {
      if (!ctx.nodes[i].instr || i == idx)
         continue;

      /* Add transitive dependencies. */
      if (entry.dependency_mask & BITFIELD_BIT(i))
         entry.dependency_mask |= ctx.nodes[i].dependency_mask;
   }
}

void
remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_t idx)
{
   const mask_t mask = ~BITFIELD_BIT(idx);
   ctx.active_mask &= mask;

   int latency = 0;
   int stall = 1;
   if (!ctx.is_vopd) {
      Instruction_cycle_info cycle_info = get_cycle_info_with_mem_latency(ctx, instr);
      latency = cycle_info.latency;
      stall = cycle_info.issue_cycles;

      if (ctx.nodes[idx].wait_cycles > 0) {
         /* Add remaining latency stall. */
         stall += ctx.nodes[idx].wait_cycles;
      }

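      /* Age the outstanding latencies of registers last written outside the DAG by the
       * issue cycles of this instruction (plus any remaining wait stall). */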
      unsigned i;
      BITSET_FOREACH_SET (i, ctx.reg_has_latency, 512) {
         if (ctx.regs[i].latency <= stall) {
            ctx.regs[i].latency = 0;
            BITSET_CLEAR(ctx.reg_has_latency, i);
         } else {
            ctx.regs[i].latency -= stall;
         }
      }
   }

   for (const Operand& op : instr->operands) {
      const unsigned reg = op.physReg();
      if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
         continue;

      for (unsigned i = 0; i < op.size(); i++) {
         RegisterInfo& reg_info = ctx.regs[reg + i];
         reg_info.read_mask &= mask;
      }
   }
   if (needs_exec_mask(instr)) {
      ctx.regs[exec_lo].read_mask &= mask;
      ctx.regs[exec_hi].read_mask &= mask;
   }
   if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
      ctx.regs[flat_scr_lo].read_mask &= mask;
      ctx.regs[flat_scr_hi].read_mask &= mask;
   }

   for (const Definition& def : instr->definitions) {
      for (unsigned i = 0; i < def.size(); i++) {
         unsigned reg = def.physReg().reg() + i;
         ctx.regs[reg].read_mask &= mask;
         if (ctx.regs[reg].has_direct_dependency && ctx.regs[reg].direct_dependency == idx) {
            ctx.regs[reg].has_direct_dependency = false;
            if (!ctx.is_vopd) {
               BITSET_SET(ctx.reg_has_latency, reg);
               ctx.regs[reg].latency = latency;
            }
         }
      }
   }

   for (unsigned i = 0; i < num_nodes; i++) {
      ctx.nodes[i].dependency_mask &= mask;
      ctx.nodes[i].wait_cycles -= stall;
      if (ctx.nodes[idx].write_for_read_mask & BITFIELD_BIT(i) && !ctx.is_vopd) {
         ctx.nodes[i].wait_cycles = MAX2(ctx.nodes[i].wait_cycles, latency);
      }
   }

   if (ctx.next_non_reorderable == idx) {
      ctx.non_reorder_mask &= mask;
      ctx.next_non_reorderable = ctx.nodes[idx].next_non_reorderable;
      if (ctx.last_non_reorderable == idx) {
         ctx.last_non_reorderable = UINT8_MAX;
         ctx.potential_partial_clause = false;
      }
   }
}

/**
 * Returns a bitfield of nodes which have to be scheduled before the
 * next non-reorderable instruction.
 * If the next non-reorderable instruction can form a clause, returns the
 * dependencies of the entire clause.
 */
mask_t
collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, mask_t clause_mask)
{
   const InstrInfo& entry = ctx.nodes[next];
   mask_t dependencies = entry.dependency_mask;
   clause_mask |= BITFIELD_BIT(next);

   /* If we depend on the clause, don't add our dependencies. */
   if (dependencies & clause_mask)
      return 0;

   if (!is_memory_instr(entry.instr))
      return dependencies;

   /* If this is potentially an "open" clause, meaning that the clause might
    * consist of instructions not yet added to the DAG, consider all previous
    * instructions as dependencies. This prevents splitting of larger, already
    * formed clauses.
    */
   if (next == ctx.last_non_reorderable && ctx.potential_partial_clause)
      return (~clause_mask & ctx.active_mask) | dependencies;

   /* Check if this can form a clause with the following non-reorderable instruction */
   if (entry.next_non_reorderable != UINT8_MAX &&
       should_form_clause(entry.instr, ctx.nodes[entry.next_non_reorderable].instr)) {
      dependencies |= collect_clause_dependencies(ctx, entry.next_non_reorderable, clause_mask);
   }

   return dependencies;
}

/**
 * Returns the index of the next instruction to be selected.
 */
unsigned
select_instruction_ilp(const SchedILPContext& ctx)
{
   mask_t mask = ctx.active_mask;

   /* First, continue the currently open clause.
    * Otherwise collect all dependencies of the next non-reorderable instruction(s).
    * These make up the list of possible candidates.
    */
   if (ctx.next_non_reorderable != UINT8_MAX) {
      if (ctx.prev_info.instr && ctx.nodes[ctx.next_non_reorderable].dependency_mask == 0 &&
          should_form_clause(ctx.prev_info.instr, ctx.nodes[ctx.next_non_reorderable].instr))
         return ctx.next_non_reorderable;
      mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0);
   }

   /* VINTRP (GFX6-10.3) can be handled like ALU, but switching between VINTRP and other
    * ALU has a cost. So if the previous instr was VINTRP, try to keep selecting VINTRP.
    */
   bool prefer_vintrp = ctx.prev_info.instr && ctx.prev_info.instr->isVINTRP();

   /* Select the instruction with lowest wait_cycles of all candidates. */
   unsigned idx = -1u;
   bool idx_vintrp = false;
   int32_t wait_cycles = INT32_MAX;
   u_foreach_bit (i, mask) {
      const InstrInfo& candidate = ctx.nodes[i];

      /* Check if the candidate has pending dependencies. */
      if (candidate.dependency_mask)
         continue;

      bool is_vintrp = prefer_vintrp && candidate.instr->isVINTRP();

      if (idx == -1u || (is_vintrp && !idx_vintrp) ||
          (is_vintrp == idx_vintrp && candidate.wait_cycles < wait_cycles)) {
         idx = i;
         idx_vintrp = is_vintrp;
         wait_cycles = candidate.wait_cycles;
      }
   }

   if (idx != -1u)
      return idx;

   /* Select the next non-reorderable instruction. (it must have no dependencies) */
   assert(ctx.next_non_reorderable != UINT8_MAX);
   assert(ctx.nodes[ctx.next_non_reorderable].dependency_mask == 0);
   return ctx.next_non_reorderable;
}

bool
compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
                   unsigned current, unsigned candidate)
{
   if (can_use_vopd(ctx, candidate)) {
      /* If we can form a VOPD instruction, always prefer to do so. */
      if (!*use_vopd) {
         *use_vopd = true;
         return true;
      }
   } else {
      if (*use_vopd)
         return false;

      /* Neither current nor candidate can form a VOPD instruction with the previously scheduled
       * instruction. */
      VOPDInfo current_vopd = ctx.vopd[current];
      VOPDInfo candidate_vopd = ctx.vopd[candidate];

      /* Delay scheduling VOPD-capable instructions in case an opportunity appears later. */
      bool current_vopd_capable = current_vopd.op != aco_opcode::num_opcodes;
      bool candidate_vopd_capable = candidate_vopd.op != aco_opcode::num_opcodes;
      if (current_vopd_capable != candidate_vopd_capable)
         return !candidate_vopd_capable;

      /* If we have to select from VOPD-capable instructions, prefer maintaining a balance of
       * odd/even instructions, in case selecting this instruction fails to make a pair.
       */
      if (current_vopd_capable && num_vopd_odd_minus_even != 0) {
         assert(candidate_vopd_capable);
         bool prefer_vopd_dst_odd = num_vopd_odd_minus_even > 0;
         if (current_vopd.is_dst_odd != candidate_vopd.is_dst_odd)
            return prefer_vopd_dst_odd ? candidate_vopd.is_dst_odd : !candidate_vopd.is_dst_odd;
      }
   }

   return ctx.nodes[candidate].wait_cycles < ctx.nodes[current].wait_cycles;
}

unsigned
select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
{
   *use_vopd = false;

   mask_t mask = ctx.active_mask;
   if (ctx.next_non_reorderable != UINT8_MAX)
      mask = ctx.nodes[ctx.next_non_reorderable].dependency_mask;

   if (mask == 0)
      return ctx.next_non_reorderable;

   int num_vopd_odd_minus_even =
      (int)util_bitcount(ctx.vopd_odd_mask & mask) - (int)util_bitcount(ctx.vopd_even_mask & mask);

   unsigned cur = -1u;
   u_foreach_bit (i, mask) {
      const InstrInfo& candidate = ctx.nodes[i];

      /* Check if the candidate has pending dependencies. */
      if (candidate.dependency_mask)
         continue;

      if (cur == -1u) {
         cur = i;
         *use_vopd = can_use_vopd(ctx, i);
      } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
         cur = i;
      }
   }

   assert(cur != -1u);
   return cur;
}

void
get_vopd_opcode_operands(const SchedILPContext& ctx, Instruction* instr, const VOPDInfo& info,
                         bool swap, aco_opcode* op, unsigned* num_operands, Operand* operands)
{
   *op = info.op;
   *num_operands += instr->operands.size();
   std::copy(instr->operands.begin(), instr->operands.end(), operands);

   if (instr->opcode == aco_opcode::v_bfrev_b32) {
      operands[0] = Operand::get_const(ctx.program->gfx_level,
                                       util_bitreverse(operands[0].constantValue()), 4);
   }

   if (swap && info.op == aco_opcode::v_dual_mov_b32) {
      *op = aco_opcode::v_dual_add_nc_u32;
      (*num_operands)++;
      operands[1] = operands[0];
      operands[0] = Operand::zero();
   } else if (swap) {
      if (info.op == aco_opcode::v_dual_sub_f32)
         *op = aco_opcode::v_dual_subrev_f32;
      else if (info.op == aco_opcode::v_dual_subrev_f32)
         *op = aco_opcode::v_dual_sub_f32;
      std::swap(operands[0], operands[1]);
   }
}

Instruction*
create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
{
   Instruction* x = ctx.prev_info.instr;
   Instruction* y = ctx.nodes[idx].instr;
   VOPDInfo x_info = ctx.prev_vopd_info;
   VOPDInfo y_info = ctx.vopd[idx];

   bool swap_x = false, swap_y = false;
   if (x_info.src_banks & y_info.src_banks) {
      assert(x_info.is_commutative || y_info.is_commutative);
      /* Avoid swapping v_mov_b32 because it will become an OPY-only opcode. */
      if (x_info.op == aco_opcode::v_dual_mov_b32 && !y_info.is_commutative) {
         swap_x = true;
         x_info.is_opy_only = true;
      } else {
         swap_x = x_info.is_commutative && x_info.op != aco_opcode::v_dual_mov_b32;
         swap_y = y_info.is_commutative && !swap_x;
      }
   }

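   /* x is emitted as the OPX half and y as the OPY half; if x ended up OPY-only, swap
    * the halves so the pairing stays encodable. */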
   if (x_info.is_opy_only) {
      std::swap(x, y);
      std::swap(x_info, y_info);
      std::swap(swap_x, swap_y);
   }

   aco_opcode x_op, y_op;
   unsigned num_operands = 0;
   Operand operands[6];
   get_vopd_opcode_operands(ctx, x, x_info, swap_x, &x_op, &num_operands, operands);
   get_vopd_opcode_operands(ctx, y, y_info, swap_y, &y_op, &num_operands, operands + num_operands);

   Instruction* instr = create_instruction(x_op, Format::VOPD, num_operands, 2);
   instr->vopd().opy = y_op;
   instr->definitions[0] = x->definitions[0];
   instr->definitions[1] = y->definitions[0];
   std::copy(operands, operands + num_operands, instr->operands.begin());

   return instr;
}

template <typename It>
void
do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_begin,
            It instructions_end)
{
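   /* Fill the DAG window with up to num_nodes instructions; every time a node is emitted,
    * its slot is refilled from the remaining input, forming a sliding window over the block. */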
   for (unsigned i = 0; i < num_nodes; i++) {
      if (remove_it == instructions_end)
         break;

      add_entry(ctx, (remove_it++)->get(), i);
   }

   ctx.prev_info.instr = NULL;
   bool use_vopd = false;

   while (ctx.active_mask) {
      unsigned next_idx =
         ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
      Instruction* next_instr = ctx.nodes[next_idx].instr;

      if (use_vopd) {
         std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
         ctx.prev_info.instr = NULL;
      } else {
         (insert_it++)->reset(next_instr);
         ctx.prev_info = ctx.nodes[next_idx];
         ctx.prev_vopd_info = ctx.vopd[next_idx];
      }

      remove_entry(ctx, next_instr, next_idx);
      ctx.nodes[next_idx].instr = NULL;

      if (remove_it != instructions_end) {
         add_entry(ctx, (remove_it++)->get(), next_idx);
      } else if (ctx.last_non_reorderable != UINT8_MAX) {
         ctx.potential_partial_clause = false;
         ctx.last_non_reorderable = UINT8_MAX;
      }
   }
}

} // namespace

void
schedule_ilp(Program* program)
{
   SchedILPContext ctx = {program};

   for (Block& block : program->blocks) {
      if (block.instructions.empty())
         continue;
      auto it = block.instructions.begin();
      auto insert_it = block.instructions.begin();
      do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
      block.instructions.resize(insert_it - block.instructions.begin());
      if (block.linear_succs.empty() || block.instructions.back()->opcode == aco_opcode::s_branch)
         BITSET_ZERO(ctx.reg_has_latency);
   }
}

void
schedule_vopd(Program* program)
{
   if (program->gfx_level < GFX11 || program->wave_size != 32)
      return;

   SchedILPContext ctx = {program};
   ctx.is_vopd = true;

   for (Block& block : program->blocks) {
      auto it = block.instructions.rbegin();
      auto insert_it = block.instructions.rbegin();
      do_schedule(ctx, insert_it, it, block.instructions.rbegin(), block.instructions.rend());
      block.instructions.erase(block.instructions.begin(), insert_it.base());
   }
}

} // namespace aco