• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2023 Valve Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "aco_ir.h"
7 
8 #include "util/bitscan.h"
9 #include "util/macros.h"
10 
11 #include <limits>
12 
13 /*
14  * This pass implements a simple forward list-scheduler which works on a small
15  * partial DAG of 16 nodes at any time. Only ALU instructions are scheduled
16  * entirely freely. Memory load instructions must be kept in-order and any other
17  * instruction must not be re-scheduled at all.
18  *
19  * The main goal of this scheduler is to create more memory clauses, schedule
20  * memory loads early, and to improve ALU instruction level parallelism.
21  */
22 
23 namespace aco {
24 namespace {
25 
26 constexpr unsigned num_nodes = 16;
27 using mask_t = uint16_t;
28 static_assert(std::numeric_limits<mask_t>::digits >= num_nodes);
29 
30 struct VOPDInfo {
VOPDInfoaco::__anon48ae54590111::VOPDInfo31    VOPDInfo() : is_opy_only(0), is_dst_odd(0), src_banks(0), has_literal(0), is_commutative(0) {}
32    uint16_t is_opy_only : 1;
33    uint16_t is_dst_odd : 1;
34    uint16_t src_banks : 10; /* 0-3: src0, 4-7: src1, 8-9: src2 */
35    uint16_t has_literal : 1;
36    uint16_t is_commutative : 1;
37    aco_opcode op = aco_opcode::num_opcodes;
38    uint32_t literal = 0;
39 };
40 
41 struct InstrInfo {
42    Instruction* instr;
43    int32_t priority;
44    mask_t dependency_mask;       /* bitmask of nodes which have to be scheduled before this node. */
45    uint8_t next_non_reorderable; /* index of next non-reorderable instruction node after this one. */
46    bool potential_clause; /* indicates that this instruction is not (yet) immediately followed by a
47                              reorderable instruction. */
48 };
49 
50 struct RegisterInfo {
51    mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
52    int8_t latency;   /* estimated latency of last register write. */
53    uint8_t direct_dependency : 4;     /* node that has to be scheduled before any other access. */
54    uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
55    uint8_t padding : 3;
56 };
57 
58 struct SchedILPContext {
59    Program* program;
60    bool is_vopd = false;
61    InstrInfo nodes[num_nodes];
62    RegisterInfo regs[512];
63    mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
64    mask_t active_mask = 0;      /* bitmask of valid instruction nodes. */
65    uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
66    uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which should not be reordered. */
67 
68    /* VOPD scheduler: */
69    VOPDInfo vopd[num_nodes];
70    VOPDInfo prev_vopd_info;
71    InstrInfo prev_info;
72 
73    mask_t vopd_odd_mask = 0;
74    mask_t vopd_even_mask = 0;
75 };
76 
77 /**
78  * Returns true for side-effect free SALU and VALU instructions.
79  */
80 bool
can_reorder(const Instruction * const instr)81 can_reorder(const Instruction* const instr)
82 {
83    if (instr->isVALU())
84       return true;
85    if (!instr->isSALU() || instr->isSOPP())
86       return false;
87 
88    switch (instr->opcode) {
89    /* SOP2 */
90    case aco_opcode::s_cbranch_g_fork:
91    case aco_opcode::s_rfe_restore_b64:
92    /* SOP1 */
93    case aco_opcode::s_setpc_b64:
94    case aco_opcode::s_swappc_b64:
95    case aco_opcode::s_rfe_b64:
96    case aco_opcode::s_cbranch_join:
97    case aco_opcode::s_set_gpr_idx_idx:
98    case aco_opcode::s_sendmsg_rtn_b32:
99    case aco_opcode::s_sendmsg_rtn_b64:
100    /* SOPK */
101    case aco_opcode::s_cbranch_i_fork:
102    case aco_opcode::s_getreg_b32:
103    case aco_opcode::s_setreg_b32:
104    case aco_opcode::s_setreg_imm32_b32:
105    case aco_opcode::s_call_b64:
106    case aco_opcode::s_waitcnt_vscnt:
107    case aco_opcode::s_waitcnt_vmcnt:
108    case aco_opcode::s_waitcnt_expcnt:
109    case aco_opcode::s_waitcnt_lgkmcnt:
110    case aco_opcode::s_subvector_loop_begin:
111    case aco_opcode::s_subvector_loop_end:
112    /* SOPC */
113    case aco_opcode::s_setvskip:
114    case aco_opcode::s_set_gpr_idx_on: return false;
115    default: break;
116    }
117 
118    return true;
119 }
120 
121 VOPDInfo
get_vopd_info(const Instruction * instr)122 get_vopd_info(const Instruction* instr)
123 {
124    if (instr->format != Format::VOP1 && instr->format != Format::VOP2)
125       return VOPDInfo();
126 
127    VOPDInfo info;
128    info.is_commutative = true;
129    switch (instr->opcode) {
130    case aco_opcode::v_fmac_f32: info.op = aco_opcode::v_dual_fmac_f32; break;
131    case aco_opcode::v_fmaak_f32: info.op = aco_opcode::v_dual_fmaak_f32; break;
132    case aco_opcode::v_fmamk_f32:
133       info.op = aco_opcode::v_dual_fmamk_f32;
134       info.is_commutative = false;
135       break;
136    case aco_opcode::v_mul_f32: info.op = aco_opcode::v_dual_mul_f32; break;
137    case aco_opcode::v_add_f32: info.op = aco_opcode::v_dual_add_f32; break;
138    case aco_opcode::v_sub_f32: info.op = aco_opcode::v_dual_sub_f32; break;
139    case aco_opcode::v_subrev_f32: info.op = aco_opcode::v_dual_subrev_f32; break;
140    case aco_opcode::v_mul_legacy_f32: info.op = aco_opcode::v_dual_mul_dx9_zero_f32; break;
141    case aco_opcode::v_mov_b32: info.op = aco_opcode::v_dual_mov_b32; break;
142    case aco_opcode::v_cndmask_b32:
143       info.op = aco_opcode::v_dual_cndmask_b32;
144       info.is_commutative = false;
145       break;
146    case aco_opcode::v_max_f32: info.op = aco_opcode::v_dual_max_f32; break;
147    case aco_opcode::v_min_f32: info.op = aco_opcode::v_dual_min_f32; break;
148    case aco_opcode::v_dot2c_f32_f16: info.op = aco_opcode::v_dual_dot2acc_f32_f16; break;
149    case aco_opcode::v_add_u32:
150       info.op = aco_opcode::v_dual_add_nc_u32;
151       info.is_opy_only = true;
152       break;
153    case aco_opcode::v_lshlrev_b32:
154       info.op = aco_opcode::v_dual_lshlrev_b32;
155       info.is_opy_only = true;
156       info.is_commutative = false;
157       break;
158    case aco_opcode::v_and_b32:
159       info.op = aco_opcode::v_dual_and_b32;
160       info.is_opy_only = true;
161       break;
162    default: return VOPDInfo();
163    }
164 
165    /* Each instruction may use at most one SGPR. */
166    if (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[0].isOfType(RegType::sgpr))
167       return VOPDInfo();
168 
169    info.is_dst_odd = instr->definitions[0].physReg().reg() & 0x1;
170 
171    static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};
172    bool has_sgpr = false;
173    for (unsigned i = 0; i < instr->operands.size(); i++) {
174       unsigned port = (instr->opcode == aco_opcode::v_fmamk_f32 && i == 1) ? 2 : i;
175       if (instr->operands[i].isOfType(RegType::vgpr))
176          info.src_banks |= 1 << (port * 4 + (instr->operands[i].physReg().reg() & bank_mask[port]));
177 
178       /* Check all operands because of fmaak/fmamk. */
179       if (instr->operands[i].isLiteral()) {
180          assert(!info.has_literal || info.literal == instr->operands[i].constantValue());
181          info.has_literal = true;
182          info.literal = instr->operands[i].constantValue();
183       }
184 
185       /* Check all operands because of cndmask. */
186       has_sgpr |= !instr->operands[i].isConstant() && instr->operands[i].isOfType(RegType::sgpr);
187    }
188 
189    /* An instruction can't use both a literal and an SGPR. */
190    if (has_sgpr && info.has_literal)
191       return VOPDInfo();
192 
193    info.is_commutative &= instr->operands[0].isOfType(RegType::vgpr);
194 
195    return info;
196 }
197 
198 bool
is_vopd_compatible(const VOPDInfo & a,const VOPDInfo & b)199 is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
200 {
201    if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd))
202       return false;
203 
204    /* Both can use a literal, but it must be the same literal. */
205    if (a.has_literal && b.has_literal && a.literal != b.literal)
206       return false;
207 
208    /* The rest is checking src VGPR bank compatibility. */
209    if ((a.src_banks & b.src_banks) == 0)
210       return true;
211 
212    if (!a.is_commutative && !b.is_commutative)
213       return false;
214 
215    uint16_t src0 = a.src_banks & 0xf;
216    uint16_t src1 = a.src_banks & 0xf0;
217    uint16_t src2 = a.src_banks & 0x300;
218    uint16_t a_src_banks = (src0 << 4) | (src1 >> 4) | src2;
219    if ((a_src_banks & b.src_banks) != 0)
220       return false;
221 
222    /* If we have to turn v_mov_b32 into v_add_u32 but there is already an OPY-only instruction,
223     * we can't do it.
224     */
225    if (a.op == aco_opcode::v_dual_mov_b32 && !b.is_commutative && b.is_opy_only)
226       return false;
227    if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only)
228       return false;
229 
230    return true;
231 }
232 
233 bool
can_use_vopd(const SchedILPContext & ctx,unsigned idx)234 can_use_vopd(const SchedILPContext& ctx, unsigned idx)
235 {
236    VOPDInfo cur_vopd = ctx.vopd[idx];
237    Instruction* first = ctx.nodes[idx].instr;
238    Instruction* second = ctx.prev_info.instr;
239 
240    if (!second)
241       return false;
242 
243    if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
244       return false;
245 
246    if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd))
247       return false;
248 
249    assert(first->definitions.size() == 1);
250    assert(first->definitions[0].size() == 1);
251    assert(second->definitions.size() == 1);
252    assert(second->definitions[0].size() == 1);
253 
254    /* Check for WaW dependency. */
255    if (first->definitions[0].physReg() == second->definitions[0].physReg())
256       return false;
257 
258    /* Check for RaW dependency. */
259    for (Operand op : second->operands) {
260       assert(op.size() == 1);
261       if (first->definitions[0].physReg() == op.physReg())
262          return false;
263    }
264 
265    /* WaR dependencies are not a concern. */
266    return true;
267 }
268 
269 unsigned
get_latency(const Instruction * const instr)270 get_latency(const Instruction* const instr)
271 {
272    /* Note, that these are not accurate latency estimations. */
273    if (instr->isVALU())
274       return 5;
275    if (instr->isSALU())
276       return 2;
277    if (instr->isVMEM() || instr->isFlatLike())
278       return 32;
279    if (instr->isSMEM())
280       return 5;
281    if (instr->accessesLDS())
282       return 2;
283 
284    return 0;
285 }
286 
287 bool
is_memory_instr(const Instruction * const instr)288 is_memory_instr(const Instruction* const instr)
289 {
290    /* For memory instructions, we allow to reorder them with ALU if it helps
291     * to form larger clauses or to increase def-use distances.
292     */
293    return instr->isVMEM() || instr->isFlatLike() || instr->isSMEM() || instr->accessesLDS();
294 }
295 
296 constexpr unsigned max_sgpr = 128;
297 constexpr unsigned min_vgpr = 256;
298 
299 void
add_entry(SchedILPContext & ctx,Instruction * const instr,const uint32_t idx)300 add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
301 {
302    InstrInfo& entry = ctx.nodes[idx];
303    entry.instr = instr;
304    entry.priority = 0;
305    const mask_t mask = BITFIELD_BIT(idx);
306    bool reorder = can_reorder(instr);
307    ctx.active_mask |= mask;
308 
309    if (ctx.is_vopd) {
310       VOPDInfo vopd = get_vopd_info(entry.instr);
311 
312       ctx.vopd[idx] = vopd;
313       ctx.vopd_odd_mask &= ~mask;
314       ctx.vopd_odd_mask |= vopd.is_dst_odd ? mask : 0;
315       ctx.vopd_even_mask &= ~mask;
316       ctx.vopd_even_mask |= vopd.is_dst_odd || vopd.op == aco_opcode::num_opcodes ? 0 : mask;
317    }
318 
319    for (const Operand& op : instr->operands) {
320       assert(op.isFixed());
321       unsigned reg = op.physReg();
322       if (reg >= max_sgpr && reg != scc && reg < min_vgpr) {
323          reorder &= reg != pops_exiting_wave_id;
324          continue;
325       }
326 
327       for (unsigned i = 0; i < op.size(); i++) {
328          RegisterInfo& reg_info = ctx.regs[reg + i];
329 
330          /* Add register reads. */
331          reg_info.read_mask |= mask;
332 
333          int cycles_since_reg_write = num_nodes;
334          if (reg_info.has_direct_dependency) {
335             /* A previous dependency is still part of the DAG. */
336             entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
337             cycles_since_reg_write = ctx.nodes[reg_info.direct_dependency].priority;
338          }
339 
340          if (reg_info.latency) {
341             /* Ignore and reset register latencies for memory loads and other non-reorderable
342              * instructions. We schedule these as early as possible anyways.
343              */
344             if (reorder && reg_info.latency > cycles_since_reg_write) {
345                entry.priority = MIN2(entry.priority, cycles_since_reg_write - reg_info.latency);
346 
347                /* If a previous register write created some latency, ensure that this
348                 * is the first read of the register by making this instruction a direct
349                 * dependency of all following register reads.
350                 */
351                reg_info.has_direct_dependency = 1;
352                reg_info.direct_dependency = idx;
353             }
354             reg_info.latency = 0;
355          }
356       }
357    }
358 
359    /* Check if this instructions reads implicit registers. */
360    if (needs_exec_mask(instr)) {
361       for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
362          if (ctx.regs[reg].has_direct_dependency)
363             entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
364          ctx.regs[reg].read_mask |= mask;
365       }
366    }
367    if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
368       for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
369          if (ctx.regs[reg].has_direct_dependency)
370             entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
371          ctx.regs[reg].read_mask |= mask;
372       }
373    }
374 
375    for (const Definition& def : instr->definitions) {
376       for (unsigned i = 0; i < def.size(); i++) {
377          RegisterInfo& reg_info = ctx.regs[def.physReg().reg() + i];
378 
379          /* Add all previous register reads and writes to the dependencies. */
380          entry.dependency_mask |= reg_info.read_mask;
381          reg_info.read_mask = mask;
382 
383          /* This register write is a direct dependency for all following reads. */
384          reg_info.has_direct_dependency = 1;
385          reg_info.direct_dependency = idx;
386 
387          if (!ctx.is_vopd) {
388             /* Add latency information for the next register read. */
389             reg_info.latency = get_latency(instr);
390          }
391       }
392    }
393 
394    if (!reorder) {
395       ctx.non_reorder_mask |= mask;
396 
397       /* Set this node as last non-reorderable instruction */
398       if (ctx.next_non_reorderable == UINT8_MAX) {
399          ctx.next_non_reorderable = idx;
400       } else {
401          ctx.nodes[ctx.last_non_reorderable].next_non_reorderable = idx;
402       }
403       ctx.last_non_reorderable = idx;
404       entry.next_non_reorderable = UINT8_MAX;
405 
406       /* Just don't reorder these at all. */
407       if (!is_memory_instr(instr) || instr->definitions.empty() ||
408           get_sync_info(instr).semantics & semantic_volatile || ctx.is_vopd) {
409          /* Add all previous instructions as dependencies. */
410          entry.dependency_mask = ctx.active_mask;
411       }
412 
413       /* Remove non-reorderable instructions from dependencies, since WaR dependencies can interfere
414        * with clause formation. This should be fine, since these are always scheduled in-order and
415        * any cases that are actually a concern for clause formation are added as transitive
416        * dependencies. */
417       entry.dependency_mask &= ~ctx.non_reorder_mask;
418       entry.potential_clause = true;
419    } else if (ctx.last_non_reorderable != UINT8_MAX) {
420       ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
421    }
422 
423    entry.dependency_mask &= ~mask;
424 
425    for (unsigned i = 0; i < num_nodes; i++) {
426       if (!ctx.nodes[i].instr || i == idx)
427          continue;
428 
429       /* Add transitive dependencies. */
430       if (entry.dependency_mask & BITFIELD_BIT(i))
431          entry.dependency_mask |= ctx.nodes[i].dependency_mask;
432 
433       /* increment base priority */
434       ctx.nodes[i].priority++;
435    }
436 }
437 
438 void
remove_entry(SchedILPContext & ctx,const Instruction * const instr,const uint32_t idx)439 remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_t idx)
440 {
441    const mask_t mask = ~BITFIELD_BIT(idx);
442    ctx.active_mask &= mask;
443 
444    for (const Operand& op : instr->operands) {
445       const unsigned reg = op.physReg();
446       if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
447          continue;
448 
449       for (unsigned i = 0; i < op.size(); i++) {
450          RegisterInfo& reg_info = ctx.regs[reg + i];
451          reg_info.read_mask &= mask;
452          reg_info.has_direct_dependency &= reg_info.direct_dependency != idx;
453       }
454    }
455    if (needs_exec_mask(instr)) {
456       ctx.regs[exec_lo].read_mask &= mask;
457       ctx.regs[exec_hi].read_mask &= mask;
458    }
459    if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
460       ctx.regs[flat_scr_lo].read_mask &= mask;
461       ctx.regs[flat_scr_hi].read_mask &= mask;
462    }
463    for (const Definition& def : instr->definitions) {
464       for (unsigned i = 0; i < def.size(); i++) {
465          unsigned reg = def.physReg().reg() + i;
466          ctx.regs[reg].read_mask &= mask;
467          ctx.regs[reg].has_direct_dependency &= ctx.regs[reg].direct_dependency != idx;
468       }
469    }
470 
471    for (unsigned i = 0; i < num_nodes; i++)
472       ctx.nodes[i].dependency_mask &= mask;
473 
474    if (ctx.next_non_reorderable == idx) {
475       ctx.non_reorder_mask &= mask;
476       ctx.next_non_reorderable = ctx.nodes[idx].next_non_reorderable;
477       if (ctx.last_non_reorderable == idx)
478          ctx.last_non_reorderable = UINT8_MAX;
479    }
480 }
481 
482 /**
483  * Returns a bitfield of nodes which have to be scheduled before the
484  * next non-reorderable instruction.
485  * If the next non-reorderable instruction can form a clause, returns the
486  * dependencies of the entire clause.
487  */
488 mask_t
collect_clause_dependencies(const SchedILPContext & ctx,const uint8_t next,mask_t clause_mask)489 collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, mask_t clause_mask)
490 {
491    const InstrInfo& entry = ctx.nodes[next];
492    mask_t dependencies = entry.dependency_mask;
493    clause_mask |= (entry.potential_clause << next);
494 
495    if (!is_memory_instr(entry.instr))
496       return dependencies;
497 
498    /* If this is potentially an "open" clause, meaning that the clause might
499     * consist of instruction not yet added to the DAG, consider all previous
500     * instructions as dependencies. This prevents splitting of larger, already
501     * formed clauses.
502     */
503    if (next == ctx.last_non_reorderable && entry.potential_clause)
504       return (~clause_mask & ctx.active_mask) | dependencies;
505 
506    if (entry.next_non_reorderable == UINT8_MAX)
507       return dependencies;
508 
509    /* Check if this can form a clause with the following non-reorderable instruction */
510    if (should_form_clause(entry.instr, ctx.nodes[entry.next_non_reorderable].instr)) {
511       mask_t clause_deps =
512          collect_clause_dependencies(ctx, entry.next_non_reorderable, clause_mask);
513 
514       /* if the following clause is independent from us, add their dependencies */
515       if (!(clause_deps & BITFIELD_BIT(next)))
516          dependencies |= clause_deps;
517    }
518 
519    return dependencies;
520 }
521 
522 /**
523  * Returns the index of the next instruction to be selected.
524  */
525 unsigned
select_instruction_ilp(const SchedILPContext & ctx)526 select_instruction_ilp(const SchedILPContext& ctx)
527 {
528    mask_t mask = ctx.active_mask;
529 
530    /* First, collect all dependencies of the next non-reorderable instruction(s).
531     * These make up the list of possible candidates.
532     */
533    if (ctx.next_non_reorderable != UINT8_MAX)
534       mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0);
535 
536    /* If the next non-reorderable instruction has no dependencies, select it */
537    if (mask == 0)
538       return ctx.next_non_reorderable;
539 
540    /* Otherwise, select the instruction with highest priority of all candidates. */
541    unsigned idx = -1u;
542    int32_t priority = INT32_MIN;
543    u_foreach_bit (i, mask) {
544       const InstrInfo& candidate = ctx.nodes[i];
545 
546       /* Check if the candidate has pending dependencies. */
547       if (candidate.dependency_mask)
548          continue;
549 
550       if (idx == -1u || candidate.priority > priority) {
551          idx = i;
552          priority = candidate.priority;
553       }
554    }
555 
556    assert(idx != -1u);
557    return idx;
558 }
559 
560 bool
compare_nodes_vopd(const SchedILPContext & ctx,int num_vopd_odd_minus_even,bool * use_vopd,unsigned current,unsigned candidate)561 compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
562                    unsigned current, unsigned candidate)
563 {
564    if (can_use_vopd(ctx, candidate)) {
565       /* If we can form a VOPD instruction, always prefer to do so. */
566       if (!*use_vopd) {
567          *use_vopd = true;
568          return true;
569       }
570    } else {
571       if (*use_vopd)
572          return false;
573 
574       /* Neither current nor candidate can form a VOPD instruction with the previously scheduled
575        * instruction. */
576       VOPDInfo current_vopd = ctx.vopd[current];
577       VOPDInfo candidate_vopd = ctx.vopd[candidate];
578 
579       /* Delay scheduling VOPD-capable instructions in case an opportunity appears later. */
580       bool current_vopd_capable = current_vopd.op != aco_opcode::num_opcodes;
581       bool candidate_vopd_capable = candidate_vopd.op != aco_opcode::num_opcodes;
582       if (current_vopd_capable != candidate_vopd_capable)
583          return !candidate_vopd_capable;
584 
585       /* If we have to select from VOPD-capable instructions, prefer maintaining a balance of
586        * odd/even instructions, in case selecting this instruction fails to make a pair.
587        */
588       if (current_vopd_capable && num_vopd_odd_minus_even != 0) {
589          assert(candidate_vopd_capable);
590          bool prefer_vopd_dst_odd = num_vopd_odd_minus_even > 0;
591          if (current_vopd.is_dst_odd != candidate_vopd.is_dst_odd)
592             return prefer_vopd_dst_odd ? candidate_vopd.is_dst_odd : !candidate_vopd.is_dst_odd;
593       }
594    }
595 
596    return ctx.nodes[candidate].priority > ctx.nodes[current].priority;
597 }
598 
599 unsigned
select_instruction_vopd(const SchedILPContext & ctx,bool * use_vopd)600 select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
601 {
602    *use_vopd = false;
603 
604    mask_t mask = ctx.active_mask;
605    if (ctx.next_non_reorderable != UINT8_MAX)
606       mask = ctx.nodes[ctx.next_non_reorderable].dependency_mask;
607 
608    if (mask == 0)
609       return ctx.next_non_reorderable;
610 
611    int num_vopd_odd_minus_even =
612       (int)util_bitcount(ctx.vopd_odd_mask & mask) - (int)util_bitcount(ctx.vopd_even_mask & mask);
613 
614    unsigned cur = -1u;
615    u_foreach_bit (i, mask) {
616       const InstrInfo& candidate = ctx.nodes[i];
617 
618       /* Check if the candidate has pending dependencies. */
619       if (candidate.dependency_mask)
620          continue;
621 
622       if (cur == -1u) {
623          cur = i;
624          *use_vopd = can_use_vopd(ctx, i);
625       } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
626          cur = i;
627       }
628    }
629 
630    assert(cur != -1u);
631    return cur;
632 }
633 
634 void
get_vopd_opcode_operands(Instruction * instr,const VOPDInfo & info,bool swap,aco_opcode * op,unsigned * num_operands,Operand * operands)635 get_vopd_opcode_operands(Instruction* instr, const VOPDInfo& info, bool swap, aco_opcode* op,
636                          unsigned* num_operands, Operand* operands)
637 {
638    *op = info.op;
639    *num_operands += instr->operands.size();
640    std::copy(instr->operands.begin(), instr->operands.end(), operands);
641 
642    if (swap && info.op == aco_opcode::v_dual_mov_b32) {
643       *op = aco_opcode::v_dual_add_nc_u32;
644       (*num_operands)++;
645       operands[0] = Operand::zero();
646       operands[1] = instr->operands[0];
647    } else if (swap) {
648       if (info.op == aco_opcode::v_dual_sub_f32)
649          *op = aco_opcode::v_dual_subrev_f32;
650       else if (info.op == aco_opcode::v_dual_subrev_f32)
651          *op = aco_opcode::v_dual_sub_f32;
652       std::swap(operands[0], operands[1]);
653    }
654 }
655 
656 Instruction*
create_vopd_instruction(const SchedILPContext & ctx,unsigned idx)657 create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
658 {
659    Instruction* x = ctx.prev_info.instr;
660    Instruction* y = ctx.nodes[idx].instr;
661    VOPDInfo x_info = ctx.prev_vopd_info;
662    VOPDInfo y_info = ctx.vopd[idx];
663 
664    bool swap_x = false, swap_y = false;
665    if (x_info.src_banks & y_info.src_banks) {
666       assert(x_info.is_commutative || y_info.is_commutative);
667       /* Avoid swapping v_mov_b32 because it will become an OPY-only opcode. */
668       if (x_info.op == aco_opcode::v_dual_mov_b32 && !y_info.is_commutative) {
669          swap_x = true;
670          x_info.is_opy_only = true;
671       } else {
672          swap_x = x_info.is_commutative && x_info.op != aco_opcode::v_dual_mov_b32;
673          swap_y = y_info.is_commutative && !swap_x;
674       }
675    }
676 
677    if (x_info.is_opy_only) {
678       std::swap(x, y);
679       std::swap(x_info, y_info);
680       std::swap(swap_x, swap_y);
681    }
682 
683    aco_opcode x_op, y_op;
684    unsigned num_operands = 0;
685    Operand operands[6];
686    get_vopd_opcode_operands(x, x_info, swap_x, &x_op, &num_operands, operands);
687    get_vopd_opcode_operands(y, y_info, swap_y, &y_op, &num_operands, operands + num_operands);
688 
689    VOPD_instruction* instr =
690       create_instruction<VOPD_instruction>(x_op, Format::VOPD, num_operands, 2);
691    instr->opy = y_op;
692    instr->definitions[0] = x->definitions[0];
693    instr->definitions[1] = y->definitions[0];
694    std::copy(operands, operands + num_operands, instr->operands.begin());
695 
696    return instr;
697 }
698 
699 template <typename It>
700 void
do_schedule(SchedILPContext & ctx,It & insert_it,It & remove_it,It instructions_begin,It instructions_end)701 do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_begin,
702             It instructions_end)
703 {
704    for (unsigned i = 0; i < num_nodes; i++) {
705       if (remove_it == instructions_end)
706          break;
707 
708       add_entry(ctx, (remove_it++)->get(), i);
709    }
710 
711    ctx.prev_info.instr = NULL;
712    bool use_vopd = false;
713 
714    while (ctx.active_mask) {
715       unsigned next_idx =
716          ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
717       Instruction* next_instr = ctx.nodes[next_idx].instr;
718 
719       if (use_vopd) {
720          std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
721          ctx.prev_info.instr = NULL;
722       } else {
723          (insert_it++)->reset(next_instr);
724          ctx.prev_info = ctx.nodes[next_idx];
725          ctx.prev_vopd_info = ctx.vopd[next_idx];
726       }
727 
728       remove_entry(ctx, next_instr, next_idx);
729       ctx.nodes[next_idx].instr = NULL;
730 
731       if (remove_it != instructions_end) {
732          add_entry(ctx, (remove_it++)->get(), next_idx);
733       } else if (ctx.last_non_reorderable != UINT8_MAX) {
734          ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
735          ctx.last_non_reorderable = UINT8_MAX;
736       }
737    }
738 }
739 
740 } // namespace
741 
742 void
schedule_ilp(Program * program)743 schedule_ilp(Program* program)
744 {
745    SchedILPContext ctx = {program};
746 
747    for (Block& block : program->blocks) {
748       auto it = block.instructions.begin();
749       auto insert_it = block.instructions.begin();
750       do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
751       block.instructions.resize(insert_it - block.instructions.begin());
752    }
753 }
754 
755 void
schedule_vopd(Program * program)756 schedule_vopd(Program* program)
757 {
758    if (program->gfx_level < GFX11 || program->wave_size != 32)
759       return;
760 
761    SchedILPContext ctx = {program};
762    ctx.is_vopd = true;
763 
764    for (Block& block : program->blocks) {
765       auto it = block.instructions.rbegin();
766       auto insert_it = block.instructions.rbegin();
767       do_schedule(ctx, insert_it, it, block.instructions.rbegin(), block.instructions.rend());
768       block.instructions.erase(block.instructions.begin(), insert_it.base());
769    }
770 }
771 
772 } // namespace aco
773