• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2019 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9 
10 #include "util/bitset.h"
11 
12 #include <algorithm>
13 #include <bitset>
14 #include <set>
15 #include <stack>
16 #include <vector>
17 
18 namespace aco {
19 namespace {
20 
/* Pass-wide context: the program being processed, the block currently being
 * rewritten, and that block's original instruction list. */
struct State {
   Program* program;
   Block* block;
   /* Original instructions of `block`; entries become null once the
    * instruction has been moved into the rewritten instruction list
    * (see search_backwards_internal). */
   std::vector<aco_ptr<Instruction>> old_instructions;
};
26 
27 struct NOP_ctx_gfx6 {
joinaco::__anon0afcfc6c0111::NOP_ctx_gfx628    void join(const NOP_ctx_gfx6& other)
29    {
30       set_vskip_mode_then_vector =
31          MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
32       valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
33       salu_wr_m0_then_gds_msg_ttrace =
34          MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
35       valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
36       salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
37       salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
38       setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
39       vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
40       smem_clause |= other.smem_clause;
41       smem_write |= other.smem_write;
42       for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
43          smem_clause_read_write[i] |= other.smem_clause_read_write[i];
44          smem_clause_write[i] |= other.smem_clause_write[i];
45       }
46    }
47 
operator ==aco::__anon0afcfc6c0111::NOP_ctx_gfx648    bool operator==(const NOP_ctx_gfx6& other)
49    {
50       return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
51              valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
52              vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
53              salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
54              valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
55              salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
56              salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
57              setreg_then_getsetreg == other.setreg_then_getsetreg &&
58              smem_clause == other.smem_clause && smem_write == other.smem_write &&
59              BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
60              BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
61    }
62 
add_wait_statesaco::__anon0afcfc6c0111::NOP_ctx_gfx663    void add_wait_states(unsigned amount)
64    {
65       if ((set_vskip_mode_then_vector -= amount) < 0)
66          set_vskip_mode_then_vector = 0;
67 
68       if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
69          valu_wr_vcc_then_div_fmas = 0;
70 
71       if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
72          salu_wr_m0_then_gds_msg_ttrace = 0;
73 
74       if ((valu_wr_exec_then_dpp -= amount) < 0)
75          valu_wr_exec_then_dpp = 0;
76 
77       if ((salu_wr_m0_then_lds -= amount) < 0)
78          salu_wr_m0_then_lds = 0;
79 
80       if ((salu_wr_m0_then_moverel -= amount) < 0)
81          salu_wr_m0_then_moverel = 0;
82 
83       if ((setreg_then_getsetreg -= amount) < 0)
84          setreg_then_getsetreg = 0;
85 
86       vmem_store_then_wr_data.reset();
87    }
88 
89    /* setting MODE.vskip and then any vector op requires 2 wait states */
90    int8_t set_vskip_mode_then_vector = 0;
91 
92    /* VALU writing VCC followed by v_div_fmas require 4 wait states */
93    int8_t valu_wr_vcc_then_div_fmas = 0;
94 
95    /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
96    int8_t salu_wr_m0_then_gds_msg_ttrace = 0;
97 
98    /* VALU writing EXEC followed by DPP requires 5 wait states */
99    int8_t valu_wr_exec_then_dpp = 0;
100 
101    /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
102    int8_t salu_wr_m0_then_lds = 0;
103 
104    /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
105    int8_t salu_wr_m0_then_moverel = 0;
106 
107    /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states
108     * currently we don't look at the actual register */
109    int8_t setreg_then_getsetreg = 0;
110 
111    /* some memory instructions writing >64bit followed by a instructions
112     * writing the VGPRs holding the writedata requires 1 wait state */
113    std::bitset<256> vmem_store_then_wr_data;
114 
115    /* we break up SMEM clauses that contain stores or overwrite an
116     * operand/definition of another instruction in the clause */
117    bool smem_clause = false;
118    bool smem_write = false;
119    BITSET_DECLARE(smem_clause_read_write, 128) = {0};
120    BITSET_DECLARE(smem_clause_write, 128) = {0};
121 };
122 
/* Hazard-tracking context for GFX10 NOP insertion. The flags record which
 * hazardous producer kinds have been seen; the bitsets track which SGPRs
 * were read by the respective instruction kinds. */
struct NOP_ctx_gfx10 {
   bool has_VOPC_write_exec = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   bool has_NSA_MIMG = false;
   bool has_writelane = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_VMEM_store;
   std::bitset<128> sgprs_read_by_DS;
   std::bitset<128> sgprs_read_by_SMEM;

   /* Merge a control-flow predecessor's state (union of all hazards). */
   void join(const NOP_ctx_gfx10& other)
   {
      has_VOPC_write_exec |= other.has_VOPC_write_exec;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      has_NSA_MIMG |= other.has_NSA_MIMG;
      has_writelane |= other.has_writelane;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_DS |= other.sgprs_read_by_DS;
      sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   /* Fixed-point check for loop headers. Const-qualified so it can be
    * called on const objects/references. */
   bool operator==(const NOP_ctx_gfx10& other) const
   {
      return has_VOPC_write_exec == other.has_VOPC_write_exec &&
             has_nonVALU_exec_read == other.has_nonVALU_exec_read && has_VMEM == other.has_VMEM &&
             has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS &&
             has_branch_after_DS == other.has_branch_after_DS &&
             has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_DS == other.sgprs_read_by_DS &&
             sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
166 
/* Tracks, for each recorded physical register, how many events have passed
 * since it was last recorded, saturating at Max. Instead of touching every
 * entry on each event, a single `base` counter is advanced (inc()) and each
 * entry stores the base value at record time; the distance is base - e.val. */
template <int Max> struct RegCounterMap {
   /* Advance the global event counter by one. */
   void inc() { base++; }
   /* Record that `reg` was just seen (distance 0). */
   void set(PhysReg reg) { update(reg, 0); }

   /* Distance since `reg` was recorded, capped at Max; Max means "no
    * recorded event is close enough to matter". */
   uint8_t get(PhysReg reg)
   {
      /* `present` is a conservative filter keyed on the low 7 bits of the
       * register index: a clear bit proves `reg` is not in `list`. */
      if (present.test(reg.reg() & 0x7F)) {
         for (entry& e : list) {
            if (e.reg == reg.reg())
               return MIN2(base - e.val, Max);
         }
      }
      return Max;
   }

   void reset()
   {
      present.reset();
      list.clear();
      base = 0;
   }

   /* True when no tracked register is still within Max events, i.e. the
    * map carries no hazard information anymore. */
   bool empty()
   {
      for (entry& e : list) {
         if (base - e.val < Max)
            return false;
      }
      return true;
   }

   /* Merge another map, keeping for each register the *smaller* distance
    * (the more hazardous state): update() takes the larger `val`, and a
    * larger stored val corresponds to a smaller distance. */
   void join_min(const RegCounterMap& other)
   {
      for (const entry& e : other.list) {
         int idx = other.base - e.val;
         if (idx >= Max)
            continue; /* already saturated; nothing to merge */

         update(e.reg, idx);
      }
   }

   /* Record `reg` at distance `idx` from now, keeping the smaller of the
    * new and any previously stored distance. */
   void update(uint16_t reg, int idx)
   {
      int16_t val = base - idx;
      for (entry& e : list) {
         if (e.reg == reg) {
            e.val = MAX2(e.val, val);
            return;
         }
      }
      list.push_back(entry{reg, val});
      present.set(reg & 0x7F);
   }

   bool operator==(const RegCounterMap& other) const
   {
      /* Two maps with different bases could also be equal, but for our use case,
       * i.e. checking for changes at loop headers, this is sufficient since we
       * always join the predecessors into an empty map with base=0.
       */
      return base == other.base && list == other.list;
   }

private:
   struct entry {
      uint16_t reg;       /* physical register index */
      int16_t val;        /* value of `base` when the register was recorded */
      bool operator!=(const entry& other) const { return reg != other.reg || val != other.val; }
   };

   std::bitset<128> present;
   small_vec<entry, 4> list;
   int base = 0;
};
242 
243 struct NOP_ctx_gfx11 {
244    /* VcmpxPermlaneHazard */
245    bool has_Vcmpx = false;
246 
247    /* LdsDirectVMEMHazard */
248    std::bitset<256> vgpr_used_by_vmem_load;
249    std::bitset<256> vgpr_used_by_vmem_sample;
250    std::bitset<256> vgpr_used_by_vmem_bvh;
251    std::bitset<256> vgpr_used_by_vmem_store;
252    std::bitset<256> vgpr_used_by_ds;
253 
254    /* VALUTransUseHazard */
255    RegCounterMap<6> valu_since_wr_by_trans;
256    RegCounterMap<2> trans_since_wr_by_trans;
257 
258    /* VALUMaskWriteHazard */
259    std::bitset<128> sgpr_read_by_valu_as_lanemask;
260    std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
261 
262    /* WMMAHazards */
263    std::bitset<256> vgpr_written_by_wmma;
264 
265    /* VALUReadSGPRHazard */
266    std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
267    RegCounterMap<11> sgpr_read_by_valu_then_wr_by_salu;
268 
joinaco::__anon0afcfc6c0111::NOP_ctx_gfx11269    void join(const NOP_ctx_gfx11& other)
270    {
271       has_Vcmpx |= other.has_Vcmpx;
272       vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
273       vgpr_used_by_vmem_sample |= other.vgpr_used_by_vmem_sample;
274       vgpr_used_by_vmem_bvh |= other.vgpr_used_by_vmem_bvh;
275       vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
276       vgpr_used_by_ds |= other.vgpr_used_by_ds;
277       valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
278       trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
279       sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
280       sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
281          other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
282       vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
283       sgpr_read_by_valu |= other.sgpr_read_by_valu;
284       sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
285    }
286 
operator ==aco::__anon0afcfc6c0111::NOP_ctx_gfx11287    bool operator==(const NOP_ctx_gfx11& other)
288    {
289       return has_Vcmpx == other.has_Vcmpx &&
290              vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
291              vgpr_used_by_vmem_sample == other.vgpr_used_by_vmem_sample &&
292              vgpr_used_by_vmem_bvh == other.vgpr_used_by_vmem_bvh &&
293              vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
294              vgpr_used_by_ds == other.vgpr_used_by_ds &&
295              valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
296              trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
297              sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
298              sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
299                 other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
300              vgpr_written_by_wmma == other.vgpr_written_by_wmma &&
301              sgpr_read_by_valu == other.sgpr_read_by_valu &&
302              sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu;
303    }
304 };
305 
306 int
get_wait_states(aco_ptr<Instruction> & instr)307 get_wait_states(aco_ptr<Instruction>& instr)
308 {
309    if (instr->opcode == aco_opcode::s_nop)
310       return instr->salu().imm + 1;
311    else if (instr->opcode == aco_opcode::p_constaddr)
312       return 3; /* lowered to 3 instructions in the assembler */
313    else
314       return 1;
315 }
316 
317 bool
regs_intersect(PhysReg a_reg,unsigned a_size,PhysReg b_reg,unsigned b_size)318 regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
319 {
320    return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
321 }
322 
/* Walk instructions backwards through `block` and, transitively, its linear
 * predecessors. instr_cb is invoked per instruction; returning true stops
 * that branch of the search. block_cb (may be null) is invoked after a
 * block's instructions; returning false stops that branch. block_state is
 * taken by value so every predecessor chain evolves its own copy, while
 * global_state is shared across the whole search. */
template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards_internal(State& state, GlobalState& global_state, BlockState block_state,
                          Block* block, bool start_at_end)
{
   if (block == state.block && start_at_end) {
      /* If it's the current block, block->instructions is incomplete. */
      for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) {
         aco_ptr<Instruction>& instr = state.old_instructions[pred_idx];
         if (!instr)
            break; /* Instruction has been moved to block->instructions. */
         if (instr_cb(global_state, block_state, instr))
            return;
      }
   }

   for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
      if (instr_cb(global_state, block_state, block->instructions[pred_idx]))
         return;
   }

   /* Comparing a function-pointer template parameter against nullptr makes
    * some compilers warn that the address is never null; silence that. */
   PRAGMA_DIAGNOSTIC_PUSH
   PRAGMA_DIAGNOSTIC_IGNORED(-Waddress)
   if (block_cb != nullptr && !block_cb(global_state, block_state, block))
      return;
   PRAGMA_DIAGNOSTIC_POP

   /* Recurse into every linear predecessor; each gets its own copy of
    * block_state (pass-by-value above). */
   for (unsigned lin_pred : block->linear_preds) {
      search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
         state, global_state, block_state, &state.program->blocks[lin_pred], true);
   }
}
357 
/* Entry point for a backwards search starting at the current block
 * (state.block). start_at_end=false: the not-yet-processed tail in
 * state.old_instructions is skipped; only already-present instructions and
 * predecessors are visited. */
template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards(State& state, GlobalState& global_state, BlockState& block_state)
{
   search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
      state, global_state, block_state, state.block, false);
}
367 
/* Shared accumulator for handle_raw_hazard's backwards search. */
struct HandleRawHazardGlobalState {
   PhysReg reg;     /* first register of the range being checked */
   int nops_needed; /* maximum wait states required over all searched paths */
};
372 
/* Per-search-branch state for handle_raw_hazard's backwards search. */
struct HandleRawHazardBlockState {
   uint32_t mask;   /* one bit per register of the range still unresolved */
   int nops_needed; /* wait states still required on this path */
};
377 
/* search_backwards callback for read-after-write hazards. `pred` is an
 * instruction preceding the one being checked; the template parameters
 * select which producer kinds (VALU/VINTRP/SALU) cause the hazard.
 * Returns true to terminate the search on this path. */
template <bool Valu, bool Vintrp, bool Salu>
bool
handle_raw_hazard_instr(HandleRawHazardGlobalState& global_state,
                        HandleRawHazardBlockState& block_state, aco_ptr<Instruction>& pred)
{
   /* Number of registers (starting at global_state.reg) still tracked. */
   unsigned mask_size = util_last_bit(block_state.mask);

   /* Bits of the tracked range that `pred` writes. */
   uint32_t writemask = 0;
   for (Definition& def : pred->definitions) {
      if (regs_intersect(global_state.reg, mask_size, def.physReg(), def.size())) {
         unsigned start = def.physReg() > global_state.reg ? def.physReg() - global_state.reg : 0;
         unsigned end = MIN2(mask_size, start + def.size());
         writemask |= u_bit_consecutive(start, end - start);
      }
   }

   /* Only the configured producer kinds create the hazard. */
   bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) ||
                                       (pred->isSALU() && Salu));
   if (is_hazard) {
      global_state.nops_needed = MAX2(global_state.nops_needed, block_state.nops_needed);
      return true;
   }

   /* A harmless write resolves those registers; stop tracking them. */
   block_state.mask &= ~writemask;
   block_state.nops_needed = MAX2(block_state.nops_needed - get_wait_states(pred), 0);

   if (block_state.mask == 0)
      block_state.nops_needed = 0;

   /* Stop searching once no more wait states could possibly be required. */
   return block_state.nops_needed == 0;
}
409 
410 template <bool Valu, bool Vintrp, bool Salu>
411 void
handle_raw_hazard(State & state,int * NOPs,int min_states,Operand op)412 handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
413 {
414    if (*NOPs >= min_states)
415       return;
416 
417    HandleRawHazardGlobalState global = {op.physReg(), 0};
418    HandleRawHazardBlockState block = {u_bit_consecutive(0, op.size()), min_states};
419 
420    /* Loops require branch instructions, which count towards the wait
421     * states. So even with loops this should finish unless nops_needed is some
422     * huge value. */
423    search_backwards<HandleRawHazardGlobalState, HandleRawHazardBlockState, nullptr,
424                     handle_raw_hazard_instr<Valu, Vintrp, Salu>>(state, global, block);
425 
426    *NOPs = MAX2(*NOPs, global.nops_needed);
427 }
428 
/* Convenience instantiations: which producer kinds <VALU, VINTRP, SALU>
 * create a read-after-write hazard for the instruction being checked. */
static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;
432 
433 void
set_bitset_range(BITSET_WORD * words,unsigned start,unsigned size)434 set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
435 {
436    unsigned end = start + size - 1;
437    unsigned start_mod = start % BITSET_WORDBITS;
438    if (start_mod + size <= BITSET_WORDBITS) {
439       BITSET_SET_RANGE_INSIDE_WORD(words, start, end);
440    } else {
441       unsigned first_size = BITSET_WORDBITS - start_mod;
442       set_bitset_range(words, start, BITSET_WORDBITS - start_mod);
443       set_bitset_range(words, start + first_size, size - first_size);
444    }
445 }
446 
447 bool
test_bitset_range(BITSET_WORD * words,unsigned start,unsigned size)448 test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
449 {
450    unsigned end = start + size - 1;
451    unsigned start_mod = start % BITSET_WORDBITS;
452    if (start_mod + size <= BITSET_WORDBITS) {
453       return BITSET_TEST_RANGE(words, start, end);
454    } else {
455       unsigned first_size = BITSET_WORDBITS - start_mod;
456       return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) ||
457              test_bitset_range(words, start + first_size, size - first_size);
458    }
459 }
460 
461 /* A SMEM clause is any group of consecutive SMEM instructions. The
462  * instructions in this group may return out of order and/or may be replayed.
463  *
464  * To fix this potential hazard correctly, we have to make sure that when a
465  * clause has more than one instruction, no instruction in the clause writes
466  * to a register that is read by another instruction in the clause (including
467  * itself). In this case, we have to break the SMEM clause by inserting non
468  * SMEM instructions.
469  *
470  * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
471  */
472 void
handle_smem_clause_hazards(Program * program,NOP_ctx_gfx6 & ctx,aco_ptr<Instruction> & instr,int * NOPs)473 handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
474                            int* NOPs)
475 {
476    /* break off from previous SMEM clause if needed */
477    if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
478       /* Don't allow clauses with store instructions since the clause's
479        * instructions may use the same address. */
480       if (ctx.smem_write || instr->definitions.empty() ||
481           instr_info.is_atomic[(unsigned)instr->opcode]) {
482          *NOPs = 1;
483       } else if (program->dev.xnack_enabled) {
484          for (Operand op : instr->operands) {
485             if (!op.isConstant() &&
486                 test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
487                *NOPs = 1;
488                break;
489             }
490          }
491 
492          Definition def = instr->definitions[0];
493          if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
494             *NOPs = 1;
495       }
496    }
497 }
498 
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
/* Per-instruction driver for GFX6-GFX9: first compute how many wait states
 * `instr` needs given the pending hazards in `ctx`, emit an s_nop carrying
 * them if necessary, then record the hazards `instr` itself creates for
 * instructions that follow. */
void
handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                        std::vector<aco_ptr<Instruction>>& new_instructions)
{
   /* check hazards */
   int NOPs = 0;

   if (instr->isSMEM()) {
      if (state.program->gfx_level == GFX6) {
         /* A read of an SGPR by SMRD instruction requires 4 wait states
          * when the SGPR was written by a VALU instruction. According to LLVM,
          * there is also an undocumented hardware behavior when the buffer
          * descriptor is written by a SALU instruction */
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand op = instr->operands[i];
            if (op.isConstant())
               continue;

            /* operand 0 larger than 64 bits: a buffer resource descriptor */
            bool is_buffer_desc = i == 0 && op.size() > 2;
            if (is_buffer_desc)
               handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
            else
               handle_valu_then_read_hazard(state, &NOPs, 4, op);
         }
      }

      handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
   } else if (instr->isSALU()) {
      if (instr->opcode == aco_opcode::s_setreg_b32 ||
          instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
          instr->opcode == aco_opcode::s_getreg_b32) {
         NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
      }

      if (state.program->gfx_level == GFX9) {
         if (instr->opcode == aco_opcode::s_movrels_b32 ||
             instr->opcode == aco_opcode::s_movrels_b64 ||
             instr->opcode == aco_opcode::s_movreld_b32 ||
             instr->opcode == aco_opcode::s_movreld_b64) {
            NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
         }
      }

      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isDS() && instr->ds().gds) {
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isVALU() || instr->isVINTRP()) {
      if (instr->isDPP()) {
         NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
         handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
      }

      /* VGPR definitions overlapping pending VMEM-store write-data. */
      for (Definition def : instr->definitions) {
         if (def.regClass().type() != RegType::sgpr) {
            for (unsigned i = 0; i < def.size(); i++)
               NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
         }
      }

      /* operand 1 is the lane-select; only a hazard when it's a register */
      if ((instr->opcode == aco_opcode::v_readlane_b32 ||
           instr->opcode == aco_opcode::v_readlane_b32_e64 ||
           instr->opcode == aco_opcode::v_writelane_b32 ||
           instr->opcode == aco_opcode::v_writelane_b32_e64) &&
          !instr->operands[1].isConstant()) {
         handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
      }

      /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
       * is followed by a read with v_readfirstlane or v_readlane to fix GPU
       * hangs on GFX6. Note that v_writelane_* is apparently not affected.
       * This hazard isn't documented anywhere but AMD confirmed that hazard.
       */
      if (state.program->gfx_level == GFX6 &&
          (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
           instr->opcode == aco_opcode::v_readfirstlane_b32)) {
         handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
      }

      if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
          instr->opcode == aco_opcode::v_div_fmas_f64)
         NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
      for (Operand op : instr->operands) {
         if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
            handle_valu_then_read_hazard(state, &NOPs, 5, op);
      }
   }

   /* MODE.vskip affects every vector (non-SALU/non-SMEM) instruction. */
   if (!instr->isSALU() && instr->format != Format::SMEM)
      NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9) {
      bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
      if (instr->isVINTRP() || lds_scratch_global ||
          instr->opcode == aco_opcode::ds_read_addtid_b32 ||
          instr->opcode == aco_opcode::ds_write_addtid_b32 ||
          instr->opcode == aco_opcode::buffer_store_lds_dword) {
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
      }
   }

   /* The inserted NOPs and this instruction itself both consume wait states. */
   ctx.add_wait_states(NOPs + get_wait_states(instr));

   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
   if (NOPs) {
      /* create NOP */
      aco_ptr<Instruction> nop{create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)};
      nop->salu().imm = NOPs - 1;
      new_instructions.emplace_back(std::move(nop));
   }

   /* update information to check for later hazards */
   if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
      /* Any non-SMEM instruction (or an inserted NOP) ends the clause. */
      ctx.smem_clause = false;
      ctx.smem_write = false;

      if (state.program->dev.xnack_enabled) {
         BITSET_ZERO(ctx.smem_clause_read_write);
         BITSET_ZERO(ctx.smem_clause_write);
      }
   }

   if (instr->isSMEM()) {
      if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
         ctx.smem_write = true;
      } else {
         ctx.smem_clause = true;

         /* Track the clause's reads and writes so later clause members can
          * be checked against them (only needed with XNACK replay). */
         if (state.program->dev.xnack_enabled) {
            for (Operand op : instr->operands) {
               if (!op.isConstant()) {
                  set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
               }
            }

            Definition def = instr->definitions[0];
            set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
            set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
         }
      }
   } else if (instr->isVALU()) {
      for (Definition def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            if (def.physReg() == vcc || def.physReg() == vcc_hi) {
               ctx.valu_wr_vcc_then_div_fmas = 4;
            }
            if (def.physReg() == exec || def.physReg() == exec_hi) {
               ctx.valu_wr_exec_then_dpp = 5;
            }
         }
      }
   } else if (instr->isSALU()) {
      if (!instr->definitions.empty()) {
         /* all other definitions should be SCC */
         Definition def = instr->definitions[0];
         if (def.physReg() == m0) {
            ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
            ctx.salu_wr_m0_then_lds = 1;
            ctx.salu_wr_m0_then_moverel = 1;
         }
      } else if (instr->opcode == aco_opcode::s_setreg_b32 ||
                 instr->opcode == aco_opcode::s_setreg_imm32_b32) {
         /* Decode the hwreg(id, offset, size) encoding from the immediate. */
         SALU_instruction& sopk = instr->salu();
         unsigned offset = (sopk.imm >> 6) & 0x1f;
         unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
         unsigned reg = sopk.imm & 0x3f;
         ctx.setreg_then_getsetreg = 2;

         /* hwreg 1 is MODE; bits from 28 upward include vskip — TODO confirm
          * exact bit position against the ISA docs. */
         if (reg == 1 && offset >= 28 && size > (28 - offset))
            ctx.set_vskip_mode_then_vector = 2;
      }
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
      bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
                          instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
      /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
       * store) */
      bool consider_mimg = instr->isMIMG() &&
                           instr->operands[1].regClass().type() == RegType::vgpr &&
                           instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
      /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
      bool consider_flat =
         instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
      if (consider_buf || consider_mimg || consider_flat) {
         /* Remember which VGPRs hold the store's write-data. */
         PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
         unsigned size = instr->operands[consider_flat ? 2 : 3].size();
         for (unsigned i = 0; i < size; i++)
            ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
      }
   }
}
693 
694 bool
is_latest_instr_vintrp(bool & global_state,bool & block_state,aco_ptr<Instruction> & pred)695 is_latest_instr_vintrp(bool& global_state, bool& block_state, aco_ptr<Instruction>& pred)
696 {
697    if (pred->isVINTRP())
698       global_state = true;
699    return true;
700 }
701 
702 template <bool Salu, bool Sgpr>
703 bool
handle_wr_hazard_instr(int & global_state,int & block_state,aco_ptr<Instruction> & pred)704 handle_wr_hazard_instr(int& global_state, int& block_state, aco_ptr<Instruction>& pred)
705 {
706    if (Salu ? pred->isSALU() : (pred->isVALU() || pred->isVINTRP())) {
707       for (Definition dst : pred->definitions) {
708          if ((dst.physReg().reg() < 256) == Sgpr) {
709             global_state = MAX2(global_state, block_state);
710             return true;
711          }
712       }
713    }
714 
715    block_state -= get_wait_states(pred);
716    return block_state <= 0;
717 }
718 
719 template <bool Salu, bool Sgpr>
720 void
handle_wr_hazard(State & state,int * NOPs,int min_states)721 handle_wr_hazard(State& state, int* NOPs, int min_states)
722 {
723    if (*NOPs >= min_states)
724       return;
725 
726    int global = 0;
727    int block = min_states;
728    search_backwards<int, int, nullptr, handle_wr_hazard_instr<Salu, Sgpr>>(state, global, block);
729    *NOPs = MAX2(*NOPs, global);
730 }
731 
/* Conservatively resolve every outstanding GFX6-GFX9 hazard tracked in ctx at
 * once by emitting a single s_nop with the maximum required wait-state count.
 * Used where precise per-instruction handling isn't possible (e.g. at points
 * that must be hazard-free regardless of what follows).
 */
void
resolve_all_gfx6(State& state, NOP_ctx_gfx6& ctx,
                 std::vector<aco_ptr<Instruction>>& new_instructions)
{
   int NOPs = 0;

   /* SGPR->SMEM hazards */
   if (state.program->gfx_level == GFX6) {
      /* Both SALU and VALU writers of SGPRs need 4 wait states before SMEM. */
      handle_wr_hazard<true, true>(state, &NOPs, 4);
      handle_wr_hazard<false, true>(state, &NOPs, 4);
   }

   /* Break up SMEM clauses */
   if (ctx.smem_clause || ctx.smem_write)
      NOPs = MAX2(NOPs, 1);

   /* SALU/GDS hazards */
   NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
   NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);

   /* VALU hazards */
   NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
   if (state.program->gfx_level >= GFX8)
      handle_wr_hazard<false, false>(state, &NOPs, 2); /* VALU->DPP */
   /* Any pending VMEM store write-data hazard needs at least one wait state. */
   NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data.any() ? 1 : 0);
   if (state.program->gfx_level == GFX6) {
      /* VINTRP->v_readlane_b32/etc */
      bool vintrp = false;
      search_backwards<bool, bool, nullptr, is_latest_instr_vintrp>(state, vintrp, vintrp);
      if (vintrp)
         NOPs = MAX2(NOPs, 1);
   }
   NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);

   /* VALU(sgpr)->VMEM/v_readlane_b32/etc hazards. v_readlane_b32/etc require only 4 NOPs. */
   handle_wr_hazard<false, true>(state, &NOPs, 5);

   NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);

   /* Account for the inserted wait states in ctx, then emit a single s_nop;
    * s_nop's immediate is (wait states - 1). */
   ctx.add_wait_states(NOPs);
   if (NOPs) {
      Builder bld(state.program, &new_instructions);
      bld.sopp(aco_opcode::s_nop, NOPs - 1);
   }
}
782 
783 template <std::size_t N>
784 bool
check_written_regs(const aco_ptr<Instruction> & instr,const std::bitset<N> & check_regs)785 check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
786 {
787    return std::any_of(instr->definitions.begin(), instr->definitions.end(),
788                       [&check_regs](const Definition& def) -> bool
789                       {
790                          bool writes_any = false;
791                          for (unsigned i = 0; i < def.size(); i++) {
792                             unsigned def_reg = def.physReg() + i;
793                             writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
794                          }
795                          return writes_any;
796                       });
797 }
798 
799 template <std::size_t N>
800 bool
check_read_regs(const aco_ptr<Instruction> & instr,const std::bitset<N> & check_regs)801 check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
802 {
803    return std::any_of(instr->operands.begin(), instr->operands.end(),
804                       [&check_regs](const Operand& op) -> bool
805                       {
806                          if (op.isConstant())
807                             return false;
808                          bool writes_any = false;
809                          for (unsigned i = 0; i < op.size(); i++) {
810                             unsigned op_reg = op.physReg() + i;
811                             writes_any |= op_reg < check_regs.size() && check_regs[op_reg];
812                          }
813                          return writes_any;
814                       });
815 }
816 
817 template <std::size_t N>
818 void
mark_read_regs(const aco_ptr<Instruction> & instr,std::bitset<N> & reg_reads)819 mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
820 {
821    for (const Operand& op : instr->operands) {
822       for (unsigned i = 0; i < op.size(); i++) {
823          unsigned reg = op.physReg() + i;
824          if (reg < reg_reads.size())
825             reg_reads.set(reg);
826       }
827    }
828 }
829 
830 template <std::size_t N>
831 void
mark_read_regs_exec(State & state,const aco_ptr<Instruction> & instr,std::bitset<N> & reg_reads)832 mark_read_regs_exec(State& state, const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
833 {
834    mark_read_regs(instr, reg_reads);
835    reg_reads.set(exec);
836    if (state.program->wave_size == 64)
837       reg_reads.set(exec_hi);
838 }
839 
840 bool
VALU_writes_sgpr(aco_ptr<Instruction> & instr)841 VALU_writes_sgpr(aco_ptr<Instruction>& instr)
842 {
843    if (instr->isVOPC())
844       return true;
845    if (instr->isVOP3() && instr->definitions.size() == 2)
846       return true;
847    if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
848        instr->opcode == aco_opcode::v_readlane_b32 ||
849        instr->opcode == aco_opcode::v_readlane_b32_e64)
850       return true;
851    return false;
852 }
853 
854 bool
instr_writes_sgpr(const aco_ptr<Instruction> & instr)855 instr_writes_sgpr(const aco_ptr<Instruction>& instr)
856 {
857    return std::any_of(instr->definitions.begin(), instr->definitions.end(),
858                       [](const Definition& def) -> bool
859                       { return def.getTemp().type() == RegType::sgpr; });
860 }
861 
862 inline bool
instr_is_branch(const aco_ptr<Instruction> & instr)863 instr_is_branch(const aco_ptr<Instruction>& instr)
864 {
865    return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
866           instr->opcode == aco_opcode::s_cbranch_scc1 ||
867           instr->opcode == aco_opcode::s_cbranch_vccz ||
868           instr->opcode == aco_opcode::s_cbranch_vccnz ||
869           instr->opcode == aco_opcode::s_cbranch_execz ||
870           instr->opcode == aco_opcode::s_cbranch_execnz ||
871           instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
872           instr->opcode == aco_opcode::s_cbranch_cdbguser ||
873           instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
874           instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
875           instr->opcode == aco_opcode::s_subvector_loop_begin ||
876           instr->opcode == aco_opcode::s_subvector_loop_end ||
877           instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
878           instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
879 }
880 
/* Per-instruction GFX10 hazard mitigation: updates the tracking state in ctx
 * and emits mitigation instructions (s_waitcnt_depctr, s_mov, s_waitcnt_vscnt,
 * v_mov, s_nop) into new_instructions *before* instr where required.
 */
void
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
{
   // TODO: s_dcache_inv needs to be in it's own group on GFX10

   Builder bld(state.program, &new_instructions);

   /* vm_vsrc=7 / sa_sdst=1 are the values that do NOT mitigate any hazard
    * below (only 0 does). */
   unsigned vm_vsrc = 7;
   unsigned sa_sdst = 1;
   if (debug_flags & DEBUG_FORCE_WAITDEPS) {
      /* Debug mode: force a full dependency wait before every instruction. */
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
      vm_vsrc = 0;
      sa_sdst = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      /* Decode the fields of an existing s_waitcnt_depctr: vm_vsrc is
       * imm[4:2], sa_sdst is imm[0]. */
      vm_vsrc = (instr->salu().imm >> 2) & 0x7;
      sa_sdst = instr->salu().imm & 0x1;
   }

   /* VMEMtoScalarWriteHazard
    * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
    * in-between.
    */
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
      /* Remember all SGPRs that are read by the VMEM/DS instruction */
      if (instr->isVMEM() || instr->isFlatLike())
         mark_read_regs_exec(
            state, instr,
            instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM);
      if (instr->isFlat() || instr->isDS())
         mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
   } else if (instr->isSALU() || instr->isSMEM()) {
      wait_imm imm;
      if (imm.unpack(state.program->gfx_level, instr.get())) {
         /* A waitcnt that drains the relevant counter clears the hazard. */
         if (imm.vm == 0)
            ctx.sgprs_read_by_VMEM.reset();
         if (imm.lgkm == 0)
            ctx.sgprs_read_by_DS.reset();
         if (imm.vs == 0)
            ctx.sgprs_read_by_VMEM_store.reset();
      } else if (vm_vsrc == 0) {
         /* s_waitcnt_depctr with vm_vsrc=0 clears all three. */
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();
      }

      /* Check if SALU writes an SGPR that was previously read by the VALU */
      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) ||
          check_written_regs(instr, ctx.sgprs_read_by_DS) ||
          check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
      }
   } else if (instr->isVALU()) {
      /* Hazard is mitigated by any VALU instruction */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
    */
   if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
      /* we only need to check definitions[0] because since GFX10 v_cmpx only writes one dest */
      ctx.has_VOPC_write_exec = true;
   } else if (ctx.has_VOPC_write_exec && (instr->opcode == aco_opcode::v_permlane16_b32 ||
                                          instr->opcode == aco_opcode::v_permlanex16_b32)) {
      ctx.has_VOPC_write_exec = false;

      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
      bld.vop1(aco_opcode::v_mov_b32, Definition(instr->operands[0].physReg(), v1),
               Operand(instr->operands[0].physReg(), v1));
   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
      /* Any other VALU (except v_nop, which SQ discards) clears the hazard. */
      ctx.has_VOPC_write_exec = false;
   }

   /* VcmpxExecWARHazard
    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
    */
   if (!instr->isVALU() && instr->reads_exec()) {
      ctx.has_nonVALU_exec_read = true;
   } else if (instr->isVALU() && ctx.has_nonVALU_exec_read) {
      if (instr->writes_exec()) {
         ctx.has_nonVALU_exec_read = false;

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
      } else if (instr_writes_sgpr(instr)) {
         /* Any VALU instruction that writes an SGPR mitigates the problem */
         ctx.has_nonVALU_exec_read = false;
      }
   } else if (sa_sdst == 0) {
      /* An s_waitcnt_depctr with sa_sdst=0 also mitigates it. */
      ctx.has_nonVALU_exec_read = false;
   }

   /* SMEMtoVectorWriteHazard
    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
    */
   if (instr->isSMEM()) {
      /* Remember all SGPRs that are read by the SMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
   } else if (VALU_writes_sgpr(instr)) {
      /* Check if VALU writes an SGPR that was previously read by SMEM */
      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
         ctx.sgprs_read_by_SMEM.reset();

         /* Insert s_mov to mitigate the problem */
         bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
      }
   } else if (instr->isSALU()) {
      wait_imm imm;
      if (imm.unpack(state.program->gfx_level, instr.get()) && imm.lgkm == 0) {
         /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
         ctx.sgprs_read_by_SMEM.reset();
      } else if (instr->format != Format::SOPP && instr->definitions.size()) {
         /* SALU can mitigate the hazard */
         ctx.sgprs_read_by_SMEM.reset();
      }
   }

   /* LdsBranchVmemWARHazard
    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
    */
   if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) {
      if (ctx.has_branch_after_DS)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_DS = false;
      ctx.has_VMEM = true;
   } else if (instr->isDS()) {
      if (ctx.has_branch_after_VMEM)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_VMEM = false;
      ctx.has_DS = true;
   } else if (instr_is_branch(instr)) {
      ctx.has_branch_after_VMEM |= ctx.has_VMEM;
      ctx.has_branch_after_DS |= ctx.has_DS;
      ctx.has_VMEM = ctx.has_DS = false;
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      /* Only s_waitcnt_vscnt can mitigate the hazard */
      const SALU_instruction& sopk = instr->salu();
      if (sopk.operands[0].physReg() == sgpr_null && sopk.imm == 0)
         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug
    * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
    * 0).
    */
   if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
      ctx.has_NSA_MIMG = true;
   } else if (ctx.has_NSA_MIMG) {
      ctx.has_NSA_MIMG = false;

      if (instr->isMUBUF() || instr->isMTBUF()) {
         uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
         /* offset & 6 tests offset bits [2:1]. */
         if (offset & 6)
            bld.sopp(aco_opcode::s_nop, 0);
      }
   }

   /* waNsaCannotFollowWritelane
    * Handles NSA MIMG immediately following a v_writelane_b32.
    */
   if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
      ctx.has_writelane = true;
   } else if (ctx.has_writelane) {
      /* Only the immediately-following instruction matters. */
      ctx.has_writelane = false;
      if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
         bld.sopp(aco_opcode::s_nop, 0);
   }
}
1057 
/* Conservatively resolve every outstanding GFX10 hazard tracked in ctx by
 * emitting the necessary mitigation instructions (v_mov, s_waitcnt_depctr,
 * s_mov, s_waitcnt_vscnt, s_nop) and clearing the corresponding state.
 */
void
resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

   /* Remember how many instructions were already emitted, so we can tell at
    * the end whether any mitigation instruction was inserted. */
   size_t prev_count = new_instructions.size();

   /* VcmpxPermlaneHazard */
   if (ctx.has_VOPC_write_exec) {
      ctx.has_VOPC_write_exec = false;
      /* PhysReg(256) is v0; a v_mov of v0 onto itself acts as the required VALU. */
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));

      /* VALU mitigates VMEMtoScalarWriteHazard. */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   /* Accumulate depctr fields so a single s_waitcnt_depctr covers both hazards. */
   unsigned waitcnt_depctr = 0xffff;

   /* VMEMtoScalarWriteHazard */
   if (ctx.sgprs_read_by_VMEM.any() || ctx.sgprs_read_by_DS.any() ||
       ctx.sgprs_read_by_VMEM_store.any()) {
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
      waitcnt_depctr &= 0xffe3; /* vm_vsrc=0 */
   }

   /* VcmpxExecWARHazard */
   if (ctx.has_nonVALU_exec_read) {
      ctx.has_nonVALU_exec_read = false;
      waitcnt_depctr &= 0xfffe; /* sa_sdst=0 */
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);

   /* SMEMtoVectorWriteHazard */
   if (ctx.sgprs_read_by_SMEM.any()) {
      ctx.sgprs_read_by_SMEM.reset();
      bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
   }

   /* LdsBranchVmemWARHazard */
   if (ctx.has_VMEM || ctx.has_branch_after_VMEM || ctx.has_DS || ctx.has_branch_after_DS) {
      bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug/waNsaCannotFollowWritelane */
   if (ctx.has_NSA_MIMG || ctx.has_writelane) {
      ctx.has_NSA_MIMG = ctx.has_writelane = false;
      /* Any instruction resolves these hazards. */
      if (new_instructions.size() == prev_count)
         bld.sopp(aco_opcode::s_nop, 0);
   }
}
1117 
1118 void
fill_vgpr_bitset(std::bitset<256> & set,PhysReg reg,unsigned bytes)1119 fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
1120 {
1121    if (reg.reg() < 256)
1122       return;
1123    for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
1124       set.set(reg.reg() - 256 + i);
1125 }
1126 
1127 bool
test_vgpr_bitset(std::bitset<256> & set,Operand op)1128 test_vgpr_bitset(std::bitset<256>& set, Operand op)
1129 {
1130    if (op.physReg().reg() < 256)
1131       return false;
1132    for (unsigned i = 0; i < op.size(); i++) {
1133       if (set[op.physReg().reg() - 256 + i])
1134          return true;
1135    }
1136    return false;
1137 }
1138 
1139 /* GFX11 */
/* Cross-block state for the backwards LdsDirectVALUHazard search. */
struct LdsDirectVALUHazardGlobalState {
   unsigned wait_vdst = 15;                  /* result: minimum va_vdst wait found so far (15 = no wait) */
   PhysReg vgpr;                             /* the VGPR written by the LDSDIR instruction */
   std::set<unsigned> loop_headers_visited;  /* loop headers already searched, to terminate cycles */
};
1145 
/* Per-path state for the backwards LdsDirectVALUHazard search. */
struct LdsDirectVALUHazardBlockState {
   unsigned num_valu = 0;   /* VALU instructions seen since the LDSDIR */
   bool has_trans = false;  /* a transcendental VALU was seen on this path */

   /* Search limits to bound compile time. */
   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};
1153 
/* search_backwards() callback for LdsDirectVALUHazard: finds the closest VALU
 * that uses the LDSDIR's destination VGPR and records (in
 * global_state.wait_vdst) how many VALU instructions separate it from the
 * LDSDIR. Returns true to stop searching along this path.
 */
bool
handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state,
                                    aco_ptr<Instruction>& instr)
{
   if (instr->isVALU()) {
      block_state.has_trans |= instr->isTrans();

      /* Does this VALU touch (read or write) the LDSDIR's destination VGPR? */
      bool uses_vgpr = false;
      for (Definition& def : instr->definitions)
         uses_vgpr |= regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1);
      for (Operand& op : instr->operands) {
         uses_vgpr |=
            !op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1);
      }
      if (uses_vgpr) {
         /* Transcendentals execute in parallel to other VALU and va_vdst count becomes unusable */
         global_state.wait_vdst =
            MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
         return true;
      }

      block_state.num_valu++;
   }

   /* A full va_vdst wait in-between resolves the hazard on this path. */
   if (parse_depctr_wait(instr.get()).va_vdst == 0)
      return true;

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set wait_vdst to be safe. */
      global_state.wait_vdst =
         MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
      return true;
   }

   /* Once enough VALU were seen, a smaller wait_vdst is impossible: stop. */
   return block_state.num_valu >= global_state.wait_vdst;
}
1192 
1193 bool
handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState & global_state,LdsDirectVALUHazardBlockState & block_state,Block * block)1194 handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state,
1195                                     LdsDirectVALUHazardBlockState& block_state, Block* block)
1196 {
1197    if (block->kind & block_kind_loop_header) {
1198       if (global_state.loop_headers_visited.count(block->index))
1199          return false;
1200       global_state.loop_headers_visited.insert(block->index);
1201    }
1202 
1203    block_state.num_blocks++;
1204 
1205    return true;
1206 }
1207 
1208 unsigned
handle_lds_direct_valu_hazard(State & state,aco_ptr<Instruction> & instr)1209 handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
1210 {
1211    /* LdsDirectVALUHazard
1212     * Handle LDSDIR writing a VGPR after it's used by a VALU instruction.
1213     */
1214    if (instr->ldsdir().wait_vdst == 0)
1215       return 0; /* early exit */
1216 
1217    LdsDirectVALUHazardGlobalState global_state;
1218    global_state.wait_vdst = instr->ldsdir().wait_vdst;
1219    global_state.vgpr = instr->definitions[0].physReg();
1220    LdsDirectVALUHazardBlockState block_state;
1221    search_backwards<LdsDirectVALUHazardGlobalState, LdsDirectVALUHazardBlockState,
1222                     &handle_lds_direct_valu_hazard_block, &handle_lds_direct_valu_hazard_instr>(
1223       state, global_state, block_state);
1224    return global_state.wait_vdst;
1225 }
1226 
/* State machine for the backwards VALUPartialForwardingHazard search. The
 * search runs in reverse program order, so the first matching VGPR write
 * found is the one *after* the exec write. */
enum VALUPartialForwardingHazardState : uint8_t {
   nothing_written,          /* no write of a read VGPR seen yet */
   written_after_exec_write, /* found the candidate "second" VGPR write; looking for the exec write */
   exec_written,             /* found the SALU exec write; looking for the earlier VGPR write */
};
1232 
/* Cross-block state for the backwards VALUPartialForwardingHazard search. */
struct VALUPartialForwardingHazardGlobalState {
   bool hazard_found = false;                /* result: set on any control-flow path with the hazard */
   std::set<unsigned> loop_headers_visited;  /* loop headers already searched, to terminate cycles */
};
1237 
/* Per-path state for the backwards VALUPartialForwardingHazard search. */
struct VALUPartialForwardingHazardBlockState {
   /* initialized by number of VGPRs read by VALU, decrement when encountered to return early */
   uint8_t num_vgprs_read = 0;
   BITSET_DECLARE(vgprs_read, 256) = {0};  /* bit per VGPR (reg - 256) still awaiting its writer */
   enum VALUPartialForwardingHazardState state = nothing_written;
   unsigned num_valu_since_read = 0;   /* VALU seen since the reading instruction */
   unsigned num_valu_since_write = 0;  /* VALU seen since the candidate second write */

   /* Search limits to bound compile time. */
   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};
1249 
/* search_backwards() callback for VALUPartialForwardingHazard: walks backwards
 * looking for the pattern VGPR-write -> SALU exec write -> VGPR-write (in
 * program order) feeding the current VALU's reads. Sets
 * global_state.hazard_found when the pattern is close enough to be hazardous.
 * Returns true to stop searching along this path.
 */
bool
handle_valu_partial_forwarding_hazard_instr(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            aco_ptr<Instruction>& instr)
{
   /* Check if there is already a hazard found on some other control flow path. */
   if (global_state.hazard_found)
      return true;

   if (instr->isSALU() && !instr->definitions.empty()) {
      /* The exec write between the two VGPR writes (searching backwards, it
       * appears after the "second" write). */
      if (block_state.state == written_after_exec_write && instr->writes_exec())
         block_state.state = exec_written;
   } else if (instr->isVALU()) {
      bool vgpr_write = false;
      for (Definition& def : instr->definitions) {
         if (def.physReg().reg() < 256)
            continue; /* only VGPR writes matter */

         for (unsigned i = 0; i < def.size(); i++) {
            unsigned reg = def.physReg().reg() - 256 + i;
            if (!BITSET_TEST(block_state.vgprs_read, reg))
               continue;

            /* Found the "first" write beyond the exec write: hazard if it is
             * within 3 VALU of the second write. */
            if (block_state.state == exec_written && block_state.num_valu_since_write < 3) {
               global_state.hazard_found = true;
               return true;
            }

            BITSET_CLEAR(block_state.vgprs_read, reg);
            block_state.num_vgprs_read--;
            vgpr_write = true;
         }
      }

      if (vgpr_write) {
         /* If the state is nothing_written: the check below should ensure that this write is
          * close enough to the read.
          *
          * If the state is exec_written: the current choice of second write has failed. Reset and
          * try with the current write as the second one, if it's close enough to the read.
          *
          * If the state is written_after_exec_write: a further second write would be better, if
          * it's close enough to the read.
          */
         if (block_state.state == nothing_written || block_state.num_valu_since_read < 5) {
            block_state.state = written_after_exec_write;
            block_state.num_valu_since_write = 0;
         } else {
            block_state.num_valu_since_write++;
         }
      } else {
         block_state.num_valu_since_write++;
      }

      block_state.num_valu_since_read++;
   } else if (parse_depctr_wait(instr.get()).va_vdst == 0) {
      /* A full va_vdst wait resolves the hazard on this path. */
      return true;
   }

   if (block_state.num_valu_since_read >= (block_state.state == nothing_written ? 5 : 8))
      return true; /* Hazard not possible at this distance. */
   if (block_state.num_vgprs_read == 0)
      return true; /* All VGPRs have been written and a hazard was never found. */

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set hazard_found=true to be safe. */
      global_state.hazard_found = true;
      return true;
   }

   return false;
}
1323 
1324 bool
handle_valu_partial_forwarding_hazard_block(VALUPartialForwardingHazardGlobalState & global_state,VALUPartialForwardingHazardBlockState & block_state,Block * block)1325 handle_valu_partial_forwarding_hazard_block(VALUPartialForwardingHazardGlobalState& global_state,
1326                                             VALUPartialForwardingHazardBlockState& block_state,
1327                                             Block* block)
1328 {
1329    if (block->kind & block_kind_loop_header) {
1330       if (global_state.loop_headers_visited.count(block->index))
1331          return false;
1332       global_state.loop_headers_visited.insert(block->index);
1333    }
1334 
1335    block_state.num_blocks++;
1336 
1337    return true;
1338 }
1339 
bool
handle_valu_partial_forwarding_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* VALUPartialForwardingHazard
    * VALU instruction reads two VGPRs: one written before an exec write by SALU and one after.
    * For the hazard, there must be less than 3 VALU between the first and second VGPR writes.
    * There also must be less than 5 VALU between the second VGPR write and the current instruction.
    */
   if (state.program->wave_size != 64 || !instr->isVALU())
      return false;

   /* Cheap pre-count before building the bitset below.
    * NOTE(review): the ternary counts non-VGPR operands (reg < 256) with their
    * full size but VGPR operands as only 1 — this looks inverted (a single
    * multi-dword VGPR operand covers multiple VGPRs yet counts once, allowing
    * an early exit). Confirm against upstream whether the condition should be
    * `>= 256 ? op.size() : 1`. */
   unsigned num_vgprs = 0;
   for (Operand& op : instr->operands)
      num_vgprs += op.physReg().reg() < 256 ? op.size() : 1;
   if (num_vgprs <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardBlockState block_state;

   /* Mark every VGPR dword read by this instruction in the search state. */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Operand& op = instr->operands[i];
      if (op.physReg().reg() < 256)
         continue;
      for (unsigned j = 0; j < op.size(); j++)
         BITSET_SET(block_state.vgprs_read, op.physReg().reg() - 256 + j);
   }
   block_state.num_vgprs_read = BITSET_COUNT(block_state.vgprs_read);

   /* The hazard needs at least two distinct VGPRs read. */
   if (block_state.num_vgprs_read <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardGlobalState global_state;
   search_backwards<VALUPartialForwardingHazardGlobalState, VALUPartialForwardingHazardBlockState,
                    &handle_valu_partial_forwarding_hazard_block,
                    &handle_valu_partial_forwarding_hazard_instr>(state, global_state, block_state);
   return global_state.hazard_found;
}
1377 
1378 void
handle_instruction_gfx11(State & state,NOP_ctx_gfx11 & ctx,aco_ptr<Instruction> & instr,std::vector<aco_ptr<Instruction>> & new_instructions)1379 handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
1380                          std::vector<aco_ptr<Instruction>>& new_instructions)
1381 {
1382    Builder bld(state.program, &new_instructions);
1383 
1384    /* Due to a hazard, an s_nop is needed before "s_sendmsg sendmsg_dealloc_vgprs". */
1385    if (instr->opcode == aco_opcode::s_sendmsg && instr->salu().imm == sendmsg_dealloc_vgprs &&
1386        (new_instructions.empty() || new_instructions.back()->opcode != aco_opcode::s_nop)) {
1387       bld.sopp(aco_opcode::s_nop, 0);
1388    }
1389 
1390    /* VcmpxPermlaneHazard
1391     * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
1392     */
1393    if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
1394       ctx.has_Vcmpx = true;
1395    } else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 ||
1396                                 instr->opcode == aco_opcode::v_permlanex16_b32 ||
1397                                 instr->opcode == aco_opcode::v_permlane64_b32 ||
1398                                 instr->opcode == aco_opcode::v_permlane16_var_b32 ||
1399                                 instr->opcode == aco_opcode::v_permlanex16_var_b32)) {
1400       ctx.has_Vcmpx = false;
1401 
1402       /* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */
1403       bld.vop1(aco_opcode::v_nop);
1404    } else if (instr->isVALU()) {
1405       ctx.has_Vcmpx = false;
1406    }
1407 
1408    unsigned va_vdst = parse_depctr_wait(instr.get()).va_vdst;
1409    unsigned vm_vsrc = 7;
1410    unsigned sa_sdst = 1;
1411 
1412    if (debug_flags & DEBUG_FORCE_WAITDEPS) {
1413       bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
1414       va_vdst = 0;
1415       vm_vsrc = 0;
1416       sa_sdst = 0;
1417    } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
1418       /* va_vdst already obtained through parse_depctr_wait(). */
1419       vm_vsrc = (instr->salu().imm >> 2) & 0x7;
1420       sa_sdst = instr->salu().imm & 0x1;
1421    } else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) {
1422       vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0;
1423    }
1424 
1425    if (instr->isLDSDIR()) {
1426       unsigned count = handle_lds_direct_valu_hazard(state, instr);
1427       LDSDIR_instruction* ldsdir = &instr->ldsdir();
1428       if (count < va_vdst) {
1429          ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
1430          va_vdst = MIN2(va_vdst, count);
1431       }
1432    }
1433 
1434    /* VALUTransUseHazard
1435     * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
1436     * in-between.
1437     */
1438    if (state.program->gfx_level < GFX11_5 && va_vdst > 0 && instr->isVALU()) {
1439       uint8_t num_valu = 15;
1440       uint8_t num_trans = 15;
1441       for (Operand& op : instr->operands) {
1442          if (op.physReg().reg() < 256)
1443             continue;
1444          for (unsigned i = 0; i < op.size(); i++) {
1445             PhysReg reg = op.physReg().advance(i * 4);
1446             num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(reg));
1447             num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(reg));
1448          }
1449       }
1450       if (num_trans <= 1 && num_valu <= 5) {
1451          bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1452          va_vdst = 0;
1453       }
1454    }
1455 
1456    if (va_vdst > 0 && state.program->gfx_level < GFX12 &&
1457        handle_valu_partial_forwarding_hazard(state, instr)) {
1458       bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1459       va_vdst = 0;
1460    }
1461 
1462    if (state.program->gfx_level < GFX12) {
1463       /* VALUMaskWriteHazard
1464        * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU or
1465        * VALU.
1466        */
1467       if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU()) &&
1468           check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
1469          bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1470          sa_sdst = 0;
1471       }
1472 
1473       if (va_vdst == 0) {
1474          ctx.valu_since_wr_by_trans.reset();
1475          ctx.trans_since_wr_by_trans.reset();
1476       }
1477 
1478       if (sa_sdst == 0)
1479          ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
1480 
1481       if (state.program->wave_size == 64 && instr->isSALU() &&
1482           check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
1483          unsigned reg = instr->definitions[0].physReg().reg();
1484          for (unsigned i = 0; i < instr->definitions[0].size(); i++)
1485             ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu[reg + i] = 1;
1486       }
1487 
1488       if (instr->isVALU()) {
1489          bool is_trans = instr->isTrans();
1490 
1491          ctx.valu_since_wr_by_trans.inc();
1492          if (is_trans)
1493             ctx.trans_since_wr_by_trans.inc();
1494 
1495          if (is_trans) {
1496             for (Definition& def : instr->definitions) {
1497                for (unsigned i = 0; i < def.size(); i++) {
1498                   PhysReg reg = def.physReg().advance(i * 4);
1499                   ctx.valu_since_wr_by_trans.set(reg);
1500                   ctx.trans_since_wr_by_trans.set(reg);
1501                }
1502             }
1503          }
1504 
1505          if (state.program->wave_size == 64) {
1506             for (Operand& op : instr->operands) {
1507                /* This should ignore exec reads */
1508                if (!op.isConstant() && op.physReg().reg() < 126)
1509                   ctx.sgpr_read_by_valu_as_lanemask.reset();
1510             }
1511             switch (instr->opcode) {
1512             case aco_opcode::v_addc_co_u32:
1513             case aco_opcode::v_subb_co_u32:
1514             case aco_opcode::v_subbrev_co_u32:
1515             case aco_opcode::v_cndmask_b16:
1516             case aco_opcode::v_cndmask_b32:
1517             case aco_opcode::v_div_fmas_f32:
1518             case aco_opcode::v_div_fmas_f64:
1519                if (instr->operands.back().physReg() != exec) {
1520                   ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
1521                   ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
1522                }
1523                break;
1524             default: break;
1525             }
1526          }
1527       }
1528    } else {
1529       /* VALUReadSGPRHazard
1530        * VALU reads SGPR and later written by SALU cannot safely be read by VALU/SALU.
1531        */
1532       if (instr->isVALU() || instr->isSALU()) {
1533          unsigned expiry_count = instr->isSALU() ? 10 : 11;
1534          for (Operand& op : instr->operands) {
1535             if (sa_sdst == 0)
1536                break;
1537 
1538             for (unsigned i = 0; i < op.size(); i++) {
1539                PhysReg reg = op.physReg().advance(i * 4);
1540                if (reg <= m0 && ctx.sgpr_read_by_valu_then_wr_by_salu.get(reg) < expiry_count) {
1541                   bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1542                   sa_sdst = 0;
1543                   break;
1544                }
1545             }
1546          }
1547       }
1548 
1549       if (sa_sdst == 0)
1550          ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
1551       else if (instr->isSALU() && !instr->isSOPP())
1552          ctx.sgpr_read_by_valu_then_wr_by_salu.inc();
1553 
1554       if (instr->isVALU()) {
1555          for (const Operand& op : instr->operands) {
1556             for (unsigned i = 0; i < DIV_ROUND_UP(op.size(), 2); i++) {
1557                unsigned reg = (op.physReg() / 2) + i;
1558                if (reg < ctx.sgpr_read_by_valu.size())
1559                   ctx.sgpr_read_by_valu.set(reg);
1560             }
1561          }
1562       } else if (instr->isSALU() && !instr->definitions.empty()) {
1563          for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
1564             PhysReg def_reg = instr->definitions[0].physReg().advance(i * 4);
1565             if ((def_reg / 2) < ctx.sgpr_read_by_valu.size() && ctx.sgpr_read_by_valu[def_reg / 2])
1566                ctx.sgpr_read_by_valu_then_wr_by_salu.set(def_reg);
1567          }
1568       }
1569    }
1570 
1571    /* LdsDirectVMEMHazard
1572     * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
1573     */
1574    if (instr->isVMEM() || instr->isFlatLike()) {
1575       if (instr->definitions.empty()) {
1576          for (Operand& op : instr->operands)
1577             fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
1578       } else {
1579          uint8_t vmem_type = state.program->gfx_level >= GFX12
1580                                 ? get_vmem_type(state.program->gfx_level, instr.get())
1581                                 : vmem_nosampler;
1582          std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
1583          if (vmem_type == vmem_sampler)
1584             vgprs = &ctx.vgpr_used_by_vmem_sample;
1585          else if (vmem_type == vmem_bvh)
1586             vgprs = &ctx.vgpr_used_by_vmem_bvh;
1587 
1588          for (Definition& def : instr->definitions)
1589             fill_vgpr_bitset(*vgprs, def.physReg(), def.bytes());
1590          for (Operand& op : instr->operands)
1591             fill_vgpr_bitset(*vgprs, op.physReg(), op.bytes());
1592       }
1593    }
1594    if (instr->isDS() || instr->isFlat()) {
1595       for (Definition& def : instr->definitions)
1596          fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes());
1597       for (Operand& op : instr->operands)
1598          fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
1599    }
1600    wait_imm imm;
1601    if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
1602       ctx.vgpr_used_by_vmem_load.reset();
1603       ctx.vgpr_used_by_vmem_sample.reset();
1604       ctx.vgpr_used_by_vmem_bvh.reset();
1605       ctx.vgpr_used_by_vmem_store.reset();
1606       ctx.vgpr_used_by_ds.reset();
1607    } else if (imm.unpack(state.program->gfx_level, instr.get())) {
1608       if (imm.vm == 0)
1609          ctx.vgpr_used_by_vmem_load.reset();
1610       if (imm.sample == 0)
1611          ctx.vgpr_used_by_vmem_sample.reset();
1612       if (imm.bvh == 0)
1613          ctx.vgpr_used_by_vmem_bvh.reset();
1614       if (imm.lgkm == 0)
1615          ctx.vgpr_used_by_ds.reset();
1616       if (imm.vs == 0)
1617          ctx.vgpr_used_by_vmem_store.reset();
1618    }
1619    if (instr->isLDSDIR()) {
1620       if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||
1621           ctx.vgpr_used_by_vmem_sample[instr->definitions[0].physReg().reg() - 256] ||
1622           ctx.vgpr_used_by_vmem_bvh[instr->definitions[0].physReg().reg() - 256] ||
1623           ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] ||
1624           ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) {
1625          if (state.program->gfx_level >= GFX12)
1626             instr->ldsdir().wait_vsrc = 0;
1627          else
1628             bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
1629          ctx.vgpr_used_by_vmem_load.reset();
1630          ctx.vgpr_used_by_vmem_sample.reset();
1631          ctx.vgpr_used_by_vmem_bvh.reset();
1632          ctx.vgpr_used_by_vmem_store.reset();
1633          ctx.vgpr_used_by_ds.reset();
1634       }
1635    }
1636 
1637    /* WMMA Hazards */
1638    if (instr_info.classes[(int)instr->opcode] == instr_class::wmma) {
1639       assert(instr->operands.back().regClass() == instr->definitions[0].regClass());
1640 
1641       bool is_swmma = instr->operands.size() == 4;
1642       if (test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[0]) ||
1643           test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[1]) ||
1644           (is_swmma && test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[2]))) {
1645          bld.vop1(aco_opcode::v_nop);
1646       }
1647 
1648       ctx.vgpr_written_by_wmma.reset();
1649       fill_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->definitions[0].physReg(),
1650                        instr->definitions[0].bytes());
1651    } else if (instr->isVALU()) {
1652       ctx.vgpr_written_by_wmma.reset();
1653    }
1654 }
1655 
1656 bool
has_vdst0_since_valu_instr(bool & global_state,unsigned & block_state,aco_ptr<Instruction> & pred)1657 has_vdst0_since_valu_instr(bool& global_state, unsigned& block_state, aco_ptr<Instruction>& pred)
1658 {
1659    if (parse_depctr_wait(pred.get()).va_vdst == 0)
1660       return true;
1661 
1662    if (--block_state == 0) {
1663       global_state = false;
1664       return true;
1665    }
1666 
1667    if (pred->isVALU()) {
1668       bool vgpr_rd_or_wr = false;
1669       for (Definition def : pred->definitions) {
1670          if (def.physReg().reg() >= 256)
1671             vgpr_rd_or_wr = true;
1672       }
1673       for (Operand op : pred->operands) {
1674          if (op.physReg().reg() >= 256)
1675             vgpr_rd_or_wr = true;
1676       }
1677       if (vgpr_rd_or_wr) {
1678          global_state = false;
1679          return true;
1680       }
1681    }
1682 
1683    return false;
1684 }
1685 
1686 void
resolve_all_gfx11(State & state,NOP_ctx_gfx11 & ctx,std::vector<aco_ptr<Instruction>> & new_instructions)1687 resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
1688                   std::vector<aco_ptr<Instruction>>& new_instructions)
1689 {
1690    Builder bld(state.program, &new_instructions);
1691 
1692    unsigned waitcnt_depctr = 0xffff;
1693    bool valu_read_sgpr = false;
1694 
1695    /* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
1696    bool has_vdst0_since_valu = true;
1697    unsigned depth = 16;
1698    search_backwards<bool, unsigned, nullptr, has_vdst0_since_valu_instr>(
1699       state, has_vdst0_since_valu, depth);
1700    if (!has_vdst0_since_valu) {
1701       waitcnt_depctr &= 0x0fff;
1702       ctx.valu_since_wr_by_trans.reset();
1703       ctx.trans_since_wr_by_trans.reset();
1704    }
1705 
1706    /* VcmpxPermlaneHazard/WMMAHazards */
1707    if (ctx.has_Vcmpx || ctx.vgpr_written_by_wmma.any()) {
1708       ctx.has_Vcmpx = false;
1709       ctx.vgpr_written_by_wmma.reset();
1710       bld.vop1(aco_opcode::v_nop);
1711    }
1712 
1713    /* VALUMaskWriteHazard */
1714    if (state.program->gfx_level < GFX12 && state.program->wave_size == 64) {
1715       if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any()) {
1716          waitcnt_depctr &= 0xfffe;
1717          ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
1718       }
1719       if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
1720          valu_read_sgpr = true;
1721          ctx.sgpr_read_by_valu_as_lanemask.reset();
1722       }
1723    }
1724 
1725    /* VALUReadSGPRHazard */
1726    if (state.program->gfx_level >= GFX12) {
1727       if (!ctx.sgpr_read_by_valu_then_wr_by_salu.empty())
1728          waitcnt_depctr &= 0xfffe;
1729 
1730       ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
1731    }
1732 
1733    /* LdsDirectVMEMHazard */
1734    if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
1735        ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
1736        ctx.vgpr_used_by_vmem_bvh.any()) {
1737       waitcnt_depctr &= 0xffe3;
1738       ctx.vgpr_used_by_vmem_load.reset();
1739       ctx.vgpr_used_by_vmem_store.reset();
1740       ctx.vgpr_used_by_ds.reset();
1741    }
1742 
1743    if (waitcnt_depctr != 0xffff)
1744       bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);
1745 
1746    if (valu_read_sgpr) {
1747       /* This has to be after the s_waitcnt_depctr so that the instruction is not involved in any
1748        * other hazards. */
1749       bld.vop3(aco_opcode::v_xor3_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
1750                Operand(PhysReg(0), s1), Operand(PhysReg(0), s1));
1751 
1752       /* workaround possible LdsDirectVALUHazard/VALUPartialForwardingHazard */
1753       bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1754    }
1755 }
1756 
/* Per-instruction hazard handler: inspects one instruction, updates the
 * tracking context, and appends the (possibly fixed-up) output to the last
 * argument. One instantiation exists per hardware generation. */
template <typename Ctx>
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
                             std::vector<aco_ptr<Instruction>>&);

/* Resolves every hazard still pending in the context by appending wait/NOP
 * instructions; used at points where execution leaves our analysis. */
template <typename Ctx>
using ResolveAll = void (*)(State& state, Ctx&, std::vector<aco_ptr<Instruction>>&);
1763 
/* Runs the per-instruction hazard handler over one block, rebuilding its
 * instruction list in place. Hazards are fully resolved before any
 * s_setpc_b64 (unknown jump target) and at the end of a block with no linear
 * successors that doesn't already end the program.
 */
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   State state;
   state.program = program;
   state.block = &block;
   /* The block's instructions are consumed from old_instructions and
    * re-emitted (with fixes inserted) into block.instructions. */
   state.old_instructions = std::move(block.instructions);

   block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
   block.instructions.reserve(state.old_instructions.size());

   bool found_end = false;
   for (aco_ptr<Instruction>& instr : state.old_instructions) {
      Handle(state, ctx, instr, block.instructions);

      /* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
      if (instr->opcode == aco_opcode::s_setpc_b64) {
         /* The s_setpc_b64 is appended first; the resolve instructions are
          * then spliced in just before it. NOTE(review): Resolve runs after
          * the append, so its backward search presumably may observe the
          * s_setpc_b64 itself — keep this ordering. */
         block.instructions.emplace_back(std::move(instr));

         std::vector<aco_ptr<Instruction>> resolve_instrs;
         Resolve(state, ctx, resolve_instrs);
         block.instructions.insert(std::prev(block.instructions.end()),
                                   std::move_iterator(resolve_instrs.begin()),
                                   std::move_iterator(resolve_instrs.end()));

         found_end = true;
         continue;
      }

      found_end |= instr->opcode == aco_opcode::s_endpgm;
      block.instructions.emplace_back(std::move(instr));
   }

   /* Resolve all possible hazards (we don't know what the shader is concatenated with). */
   if (block.linear_succs.empty() && !found_end)
      Resolve(state, ctx, block.instructions);
}
1805 
1806 template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
1807 void
mitigate_hazards(Program * program,Ctx initial_ctx=Ctx ())1808 mitigate_hazards(Program* program, Ctx initial_ctx = Ctx())
1809 {
1810    std::vector<Ctx> all_ctx(program->blocks.size());
1811    std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
1812 
1813    for (unsigned i = 0; i < program->blocks.size(); i++) {
1814       Block& block = program->blocks[i];
1815       Ctx& ctx = all_ctx[i];
1816 
1817       if (i == 0 || (block.kind & block_kind_resume))
1818          ctx = initial_ctx;
1819 
1820       if (block.kind & block_kind_loop_header) {
1821          loop_header_indices.push(i);
1822       } else if (block.kind & block_kind_loop_exit) {
1823          /* Go through the whole loop again */
1824          for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
1825             Ctx loop_block_ctx;
1826             for (unsigned b : program->blocks[idx].linear_preds)
1827                loop_block_ctx.join(all_ctx[b]);
1828 
1829             handle_block<Ctx, Handle, Resolve>(program, loop_block_ctx, program->blocks[idx]);
1830 
1831             /* We only need to continue if the loop header context changed */
1832             if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
1833                break;
1834 
1835             all_ctx[idx] = loop_block_ctx;
1836          }
1837 
1838          loop_header_indices.pop();
1839       }
1840 
1841       for (unsigned b : block.linear_preds)
1842          ctx.join(all_ctx[b]);
1843 
1844       handle_block<Ctx, Handle, Resolve>(program, ctx, block);
1845    }
1846 }
1847 
/* FeatureRequiredExportPriority in LLVM: raises instruction priority to 2 for
 * the whole shader and wraps each export sequence in a
 * setprio(0)/wait/nops/setprio(2) pattern (the trailing restore is skipped
 * right before s_endpgm). NOTE(review): applied on GFX11.5 NGG/PS only — see
 * caller; presumably a hardware workaround, per the LLVM feature name. */
void
required_export_priority(Program* program)
{
   /* Skip callees, assuming that the caller has already increased the priority. */
   bool increase_priority = !program->is_epilog && !program->info.vs.has_prolog &&
                            (!program->info.merged_shader_compiled_separately ||
                             program->stage.sw == SWStage::VS || program->stage.sw == SWStage::TES);
   increase_priority |= program->is_prolog;

   for (Block& block : program->blocks) {
      std::vector<aco_ptr<Instruction>> new_instructions;
      /* +6 covers the worst-case insertion for one export sequence
       * (setprio, waitcnt, 2x nop, setprio) plus the leading setprio. */
      new_instructions.reserve(block.instructions.size() + 6);

      Builder bld(program, &new_instructions);

      /* Enter the shader at priority >= 2, reusing an existing s_setprio if
       * the block already starts with one. */
      if (increase_priority && block.index == 0) {
         if (!block.instructions.empty() && block.instructions[0]->opcode == aco_opcode::s_setprio)
            block.instructions[0]->salu().imm = MAX2(block.instructions[0]->salu().imm, 2);
         else
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      for (unsigned i = 0; i < block.instructions.size(); i++) {
         /* Raw pointer kept so the instruction can still be inspected/patched
          * after its aco_ptr is moved into new_instructions. */
         Instruction* instr = block.instructions[i].get();
         new_instructions.push_back(std::move(block.instructions[i]));

         /* Clamp any pre-existing priority changes to at least 2. */
         if (instr->opcode == aco_opcode::s_setprio) {
            instr->salu().imm = MAX2(instr->salu().imm, 2);
            continue;
         }

         /* Only act on the last export of a consecutive run of exports. */
         bool end_of_export_sequence = instr->isEXP() && (i == block.instructions.size() - 1 ||
                                                          !block.instructions[i + 1]->isEXP());
         if (!end_of_export_sequence)
            continue;

         bool before_endpgm = false;
         if (i != block.instructions.size() - 1) {
            before_endpgm = block.instructions[i + 1]->opcode == aco_opcode::s_endpgm;
         } else {
            /* Does this fallthrough to a s_endpgm? */
            for (unsigned j = block.index + 1; j < program->blocks.size(); j++) {
               if (program->blocks[j].instructions.size() == 1 &&
                   program->blocks[j].instructions[0]->opcode == aco_opcode::s_endpgm)
                  before_endpgm = true;
               if (!program->blocks[j].instructions.empty())
                  break;
            }
         }

         /* Drop priority around the post-export window; when the program ends
          * right after, the expcnt wait and priority restore are unnecessary. */
         bld.sopp(aco_opcode::s_setprio, 0);
         if (!before_endpgm)
            bld.sopk(aco_opcode::s_waitcnt_expcnt, Operand(sgpr_null, s1), 0);
         bld.sopp(aco_opcode::s_nop, 0);
         bld.sopp(aco_opcode::s_nop, 0);
         if (!before_endpgm)
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      block.instructions = std::move(new_instructions);
   }
}
1911 
1912 } /* end namespace */
1913 
1914 void
insert_NOPs(Program * program)1915 insert_NOPs(Program* program)
1916 {
1917    if (program->gfx_level >= GFX11) {
1918       NOP_ctx_gfx11 initial_ctx;
1919 
1920       bool has_previous_part =
1921          program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog ||
1922          (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS &&
1923           program->stage.sw != SWStage::TES) || program->stage == raytracing_cs;
1924       if (program->gfx_level >= GFX12 && has_previous_part) {
1925          /* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that any
1926           * SGPR might have been read by VALU if there was a previous shader part.
1927           */
1928          initial_ctx.sgpr_read_by_valu.flip();
1929       }
1930 
1931       mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
1932                                                                                    initial_ctx);
1933    } else if (program->gfx_level >= GFX10_3) {
1934       ; /* no hazards/bugs to mitigate */
1935    } else if (program->gfx_level >= GFX10) {
1936       mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
1937    } else {
1938       mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
1939    }
1940 
1941    if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
1942                                          program->stage.hw == AC_HW_PIXEL_SHADER))
1943       required_export_priority(program);
1944 }
1945 
1946 } // namespace aco
1947