/*
 * Copyright © 2019 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "util/bitset.h"

#include <algorithm>
#include <bitset>
#include <set>
#include <stack>
#include <vector>

namespace aco {
namespace {

struct State {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> old_instructions;
};

struct NOP_ctx_gfx6 {
   void join(const NOP_ctx_gfx6& other)
   {
      set_vskip_mode_then_vector =
         MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
      valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
      salu_wr_m0_then_gds_msg_ttrace =
         MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
      valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
      salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
      salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
      setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
      vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
      smem_clause |= other.smem_clause;
      smem_write |= other.smem_write;
      for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
         smem_clause_read_write[i] |= other.smem_clause_read_write[i];
         smem_clause_write[i] |= other.smem_clause_write[i];
      }
   }

   bool operator==(const NOP_ctx_gfx6& other)
   {
      return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
             valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
             vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
             salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
             valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
             salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
             salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
             setreg_then_getsetreg == other.setreg_then_getsetreg &&
             smem_clause == other.smem_clause && smem_write == other.smem_write &&
             BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
             BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
   }

   void add_wait_states(unsigned amount)
   {
      if ((set_vskip_mode_then_vector -= amount) < 0)
         set_vskip_mode_then_vector = 0;

      if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
         valu_wr_vcc_then_div_fmas = 0;

      if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
         salu_wr_m0_then_gds_msg_ttrace = 0;

      if ((valu_wr_exec_then_dpp -= amount) < 0)
         valu_wr_exec_then_dpp = 0;

      if ((salu_wr_m0_then_lds -= amount) < 0)
         salu_wr_m0_then_lds = 0;

      if ((salu_wr_m0_then_moverel -= amount) < 0)
         salu_wr_m0_then_moverel = 0;

      if ((setreg_then_getsetreg -= amount) < 0)
         setreg_then_getsetreg = 0;

      vmem_store_then_wr_data.reset();
   }

   /* Setting MODE.vskip and then executing any vector op requires 2 wait states. */
   int8_t set_vskip_mode_then_vector = 0;

   /* A VALU instruction writing VCC followed by v_div_fmas requires 4 wait states. */
   int8_t valu_wr_vcc_then_div_fmas = 0;

   /* A SALU instruction writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state. */
   int8_t salu_wr_m0_then_gds_msg_ttrace = 0;

   /* A VALU instruction writing EXEC followed by DPP requires 5 wait states. */
   int8_t valu_wr_exec_then_dpp = 0;

   /* A SALU instruction writing M0 followed by some LDS instructions requires 1 wait state on GFX10. */
   int8_t salu_wr_m0_then_lds = 0;

   /* A SALU instruction writing M0 followed by s_moverel requires 1 wait state on GFX9. */
   int8_t salu_wr_m0_then_moverel = 0;

   /* s_setreg followed by an s_getreg/s_setreg of the same register needs 2 wait states;
    * currently we don't check the actual register. */
   int8_t setreg_then_getsetreg = 0;

   /* Some memory instructions writing >64 bits, followed by an instruction
    * writing the VGPRs holding the write-data, require 1 wait state. */
   std::bitset<256> vmem_store_then_wr_data;

   /* We break up SMEM clauses that contain stores or overwrite an
    * operand/definition of another instruction in the clause. */
   bool smem_clause = false;
   bool smem_write = false;
   BITSET_DECLARE(smem_clause_read_write, 128) = {0};
   BITSET_DECLARE(smem_clause_write, 128) = {0};
};
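
/* A rough worked example of the counter scheme above (illustrative only):
 * a VALU write to EXEC sets valu_wr_exec_then_dpp = 5, and add_wait_states()
 * then counts the counter down by each subsequent instruction's wait-state
 * cost. If a DPP instruction appears while the counter is still positive,
 * the remaining value is the number of NOP wait states we must insert, e.g.:
 *
 *    v_cmpx_...            ; writes EXEC, counter = 5
 *    s_nop 1               ; 2 wait states, counter = 3
 *    v_mov_b32_dpp ...     ; hazard: 3 more wait states still needed
 */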

struct NOP_ctx_gfx10 {
   bool has_VOPC_write_exec = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   bool has_NSA_MIMG = false;
   bool has_writelane = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_VMEM_store;
   std::bitset<128> sgprs_read_by_DS;
   std::bitset<128> sgprs_read_by_SMEM;

   void join(const NOP_ctx_gfx10& other)
   {
      has_VOPC_write_exec |= other.has_VOPC_write_exec;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      has_NSA_MIMG |= other.has_NSA_MIMG;
      has_writelane |= other.has_writelane;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_DS |= other.sgprs_read_by_DS;
      sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   bool operator==(const NOP_ctx_gfx10& other)
   {
      return has_VOPC_write_exec == other.has_VOPC_write_exec &&
             has_nonVALU_exec_read == other.has_nonVALU_exec_read && has_VMEM == other.has_VMEM &&
             has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS &&
             has_branch_after_DS == other.has_branch_after_DS &&
             has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_DS == other.sgprs_read_by_DS &&
             sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};

template <int Max> struct RegCounterMap {
   void inc() { base++; }
   void set(PhysReg reg) { update(reg, 0); }

   uint8_t get(PhysReg reg)
   {
      if (present.test(reg.reg() & 0x7F)) {
         for (entry& e : list) {
            if (e.reg == reg.reg())
               return MIN2(base - e.val, Max);
         }
      }
      return Max;
   }

   void reset()
   {
      present.reset();
      list.clear();
      base = 0;
   }

   bool empty()
   {
      for (entry& e : list) {
         if (base - e.val < Max)
            return false;
      }
      return true;
   }

   void join_min(const RegCounterMap& other)
   {
      for (const entry& e : other.list) {
         int idx = other.base - e.val;
         if (idx >= Max)
            continue;

         update(e.reg, idx);
      }
   }

   void update(uint16_t reg, int idx)
   {
      int16_t val = base - idx;
      for (entry& e : list) {
         if (e.reg == reg) {
            e.val = MAX2(e.val, val);
            return;
         }
      }
      list.push_back(entry{reg, val});
      present.set(reg & 0x7F);
   }

   bool operator==(const RegCounterMap& other) const
   {
      /* Two maps with different bases could also be equal, but for our use case,
       * i.e. checking for changes at loop headers, this is sufficient since we
       * always join the predecessors into an empty map with base=0.
       */
      return base == other.base && list == other.list;
   }

private:
   struct entry {
      uint16_t reg;
      int16_t val;
      bool operator!=(const entry& other) const { return reg != other.reg || val != other.val; }
   };

   std::bitset<128> present;
   small_vec<entry, 4> list;
   int base = 0;
};
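
/* An illustrative sketch of RegCounterMap's semantics (not part of the pass):
 * set(reg) records "reg was written now", inc() is called once per later
 * instruction, and get(reg) returns how many instructions have passed since
 * the write, saturating at Max. Assuming some PhysReg r:
 *
 *    RegCounterMap<6> m;
 *    m.set(r);              // write happens here
 *    m.inc();               // one instruction later
 *    m.inc();               // two instructions later
 *    assert(m.get(r) == 2); // distance since the write
 *    // any register never set() reports the safe maximum, Max
 */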

struct NOP_ctx_gfx11 {
   /* VcmpxPermlaneHazard */
   bool has_Vcmpx = false;

   /* LdsDirectVMEMHazard */
   std::bitset<256> vgpr_used_by_vmem_load;
   std::bitset<256> vgpr_used_by_vmem_sample;
   std::bitset<256> vgpr_used_by_vmem_bvh;
   std::bitset<256> vgpr_used_by_vmem_store;
   std::bitset<256> vgpr_used_by_ds;

   /* VALUTransUseHazard */
   RegCounterMap<6> valu_since_wr_by_trans;
   RegCounterMap<2> trans_since_wr_by_trans;

   /* VALUMaskWriteHazard */
   std::bitset<128> sgpr_read_by_valu_as_lanemask;
   std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;

   /* WMMAHazards */
   std::bitset<256> vgpr_written_by_wmma;

   /* VALUReadSGPRHazard */
   std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
   std::bitset<m0.reg()> sgpr_read_by_valu_then_wr_by_valu;
   RegCounterMap<11> sgpr_read_by_valu_then_wr_by_salu;

   void join(const NOP_ctx_gfx11& other)
   {
      has_Vcmpx |= other.has_Vcmpx;
      vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
      vgpr_used_by_vmem_sample |= other.vgpr_used_by_vmem_sample;
      vgpr_used_by_vmem_bvh |= other.vgpr_used_by_vmem_bvh;
      vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
      vgpr_used_by_ds |= other.vgpr_used_by_ds;
      valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
      trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
      sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
      sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
         other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
      vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
      sgpr_read_by_valu |= other.sgpr_read_by_valu;
      sgpr_read_by_valu_then_wr_by_valu |= other.sgpr_read_by_valu_then_wr_by_valu;
      sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
   }

   bool operator==(const NOP_ctx_gfx11& other)
   {
      return has_Vcmpx == other.has_Vcmpx &&
             vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
             vgpr_used_by_vmem_sample == other.vgpr_used_by_vmem_sample &&
             vgpr_used_by_vmem_bvh == other.vgpr_used_by_vmem_bvh &&
             vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
             vgpr_used_by_ds == other.vgpr_used_by_ds &&
             valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
             trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
             sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
             sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
                other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
             vgpr_written_by_wmma == other.vgpr_written_by_wmma &&
             sgpr_read_by_valu == other.sgpr_read_by_valu &&
             sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu;
   }
};

int
get_wait_states(aco_ptr<Instruction>& instr)
{
   if (instr->opcode == aco_opcode::s_nop)
      return instr->salu().imm + 1;
   else if (instr->opcode == aco_opcode::p_constaddr)
      return 3; /* lowered to 3 instructions in the assembler */
   else
      return 1;
}

bool
regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
   return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
}
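
/* Worked example (illustrative): regs_intersect(PhysReg{0}, 2, PhysReg{1}, 2)
 * compares the ranges [0,1] and [1,2]; since b_reg - a_reg = 1 < a_size = 2,
 * they overlap and the function returns true. Disjoint ranges like [0,1] and
 * [2,3] yield 2 < 2 == false. */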

template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards_internal(State& state, GlobalState& global_state, BlockState block_state,
                          Block* block, bool start_at_end)
{
   if (block == state.block && start_at_end) {
      /* If it's the current block, block->instructions is incomplete. */
      for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) {
         aco_ptr<Instruction>& instr = state.old_instructions[pred_idx];
         if (!instr)
            break; /* Instruction has been moved to block->instructions. */
         if (instr_cb(global_state, block_state, instr))
            return;
      }
   }

   for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
      if (instr_cb(global_state, block_state, block->instructions[pred_idx]))
         return;
   }

   PRAGMA_DIAGNOSTIC_PUSH
   PRAGMA_DIAGNOSTIC_IGNORED(-Waddress)
   if (block_cb != nullptr && !block_cb(global_state, block_state, block))
      return;
   PRAGMA_DIAGNOSTIC_POP

   for (unsigned lin_pred : block->linear_preds) {
      search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
         state, global_state, block_state, &state.program->blocks[lin_pred], true);
   }
}

template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards(State& state, GlobalState& global_state, BlockState& block_state)
{
   search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
      state, global_state, block_state, state.block, false);
}
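
/* Sketch of the callback contract (illustrative; mirrors is_latest_instr_vintrp
 * further below): instr_cb returns true to stop searching the current path,
 * block_cb returns false to stop recursing into a block's predecessors.
 * BlockState is copied per control-flow path while GlobalState is shared and
 * accumulates the result. A minimal "was the latest instruction a VALU?"
 * query could look like:
 *
 *    bool latest_is_valu_cb(bool& global, bool& block, aco_ptr<Instruction>& pred)
 *    {
 *       global = pred->isVALU();
 *       return true; // look at one instruction per path only
 *    }
 *    ...
 *    bool result = false, unused = false;
 *    search_backwards<bool, bool, nullptr, latest_is_valu_cb>(state, result, unused);
 */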

struct HandleRawHazardGlobalState {
   PhysReg reg;
   int nops_needed;
};

struct HandleRawHazardBlockState {
   uint32_t mask;
   int nops_needed;
};

template <bool Valu, bool Vintrp, bool Salu>
bool
handle_raw_hazard_instr(HandleRawHazardGlobalState& global_state,
                        HandleRawHazardBlockState& block_state, aco_ptr<Instruction>& pred)
{
   unsigned mask_size = util_last_bit(block_state.mask);

   uint32_t writemask = 0;
   for (Definition& def : pred->definitions) {
      if (regs_intersect(global_state.reg, mask_size, def.physReg(), def.size())) {
         unsigned start = def.physReg() > global_state.reg ? def.physReg() - global_state.reg : 0;
         unsigned end = MIN2(mask_size, start + def.size());
         writemask |= u_bit_consecutive(start, end - start);
      }
   }

   bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) ||
                                       (pred->isSALU() && Salu));
   if (is_hazard) {
      global_state.nops_needed = MAX2(global_state.nops_needed, block_state.nops_needed);
      return true;
   }

   block_state.mask &= ~writemask;
   block_state.nops_needed = MAX2(block_state.nops_needed - get_wait_states(pred), 0);

   if (block_state.mask == 0)
      block_state.nops_needed = 0;

   return block_state.nops_needed == 0;
}

template <bool Valu, bool Vintrp, bool Salu>
void
handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
{
   if (*NOPs >= min_states)
      return;

   HandleRawHazardGlobalState global = {op.physReg(), 0};
   HandleRawHazardBlockState block = {u_bit_consecutive(0, op.size()), min_states};

   /* Loops require branch instructions, which count towards the wait
    * states. So even with loops this should finish unless nops_needed is some
    * huge value. */
   search_backwards<HandleRawHazardGlobalState, HandleRawHazardBlockState, nullptr,
                    handle_raw_hazard_instr<Valu, Vintrp, Salu>>(state, global, block);

   *NOPs = MAX2(*NOPs, global.nops_needed);
}

static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;
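
/* Illustrative read-after-write scenario these helpers cover (see the GFX6
 * SMRD handling below), assuming this register assignment:
 *
 *    v_readlane_b32 s0, v0, 0              ; VALU writes SGPR s0
 *    s_buffer_load_dword s4, s[0:3], 0x0   ; SMRD reads s0 -> 4 wait states
 *
 * handle_raw_hazard walks backwards from the read with nops_needed =
 * min_states, subtracts each intervening instruction's wait states, and
 * reports how many NOP wait states are still outstanding when it finds the
 * write. */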

void
set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      BITSET_SET_RANGE_INSIDE_WORD(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      set_bitset_range(words, start, BITSET_WORDBITS - start_mod);
      set_bitset_range(words, start + first_size, size - first_size);
   }
}

bool
test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      return BITSET_TEST_RANGE(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) ||
             test_bitset_range(words, start + first_size, size - first_size);
   }
}
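
/* Worked example (assuming 32-bit BITSET words): set_bitset_range(words, 30, 4)
 * spans a word boundary, so it splits into set_bitset_range(words, 30, 2)
 * (bits 30-31 of word 0) and set_bitset_range(words, 32, 2) (bits 0-1 of
 * word 1). Ranges that fit inside one word are handled directly. */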

/* A SMEM clause is any group of consecutive SMEM instructions. The
 * instructions in this group may return out of order and/or may be replayed.
 *
 * To fix this potential hazard correctly, we have to make sure that when a
 * clause has more than one instruction, no instruction in the clause writes
 * to a register that is read by another instruction in the clause (including
 * itself). In this case, we have to break the SMEM clause by inserting
 * non-SMEM instructions.
 *
 * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
 */
void
handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                           int* NOPs)
{
   /* break off from previous SMEM clause if needed */
   if (!*NOPs && (ctx.smem_clause || ctx.smem_write)) {
      /* Don't allow clauses with store instructions since the clause's
       * instructions may use the same address. */
      if (ctx.smem_write || instr->definitions.empty() ||
          instr_info.is_atomic[(unsigned)instr->opcode]) {
         *NOPs = 1;
      } else if (program->dev.xnack_enabled) {
         for (Operand op : instr->operands) {
            if (!op.isConstant() &&
                test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
               *NOPs = 1;
               break;
            }
         }

         Definition def = instr->definitions[0];
         if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
            *NOPs = 1;
      }
   }
}
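
/* Illustrative clause break (not taken from real output): with XNACK enabled,
 *
 *    s_load_dwordx2 s[0:1], s[8:9], 0x0
 *    s_load_dword   s0, s[10:11], 0x0   ; writes s0, which the clause
 *                                       ; already wrote -> break the clause
 *
 * the pass requests one wait state, so an "s_nop 0" is inserted between the
 * two loads and they no longer form one clause. */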

/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void
handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                        std::vector<aco_ptr<Instruction>>& new_instructions)
{
   /* check hazards */
   int NOPs = 0;

   if (instr->isSMEM()) {
      if (state.program->gfx_level == GFX6) {
         /* A read of an SGPR by an SMRD instruction requires 4 wait states
          * when the SGPR was written by a VALU instruction. According to LLVM,
          * there is also an undocumented hardware behavior when the buffer
          * descriptor is written by a SALU instruction. */
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand op = instr->operands[i];
            if (op.isConstant())
               continue;

            bool is_buffer_desc = i == 0 && op.size() > 2;
            if (is_buffer_desc)
               handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
            else
               handle_valu_then_read_hazard(state, &NOPs, 4, op);
         }
      }

      handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
   } else if (instr->isSALU()) {
      if (instr->opcode == aco_opcode::s_setreg_b32 ||
          instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
          instr->opcode == aco_opcode::s_getreg_b32) {
         NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
      }

      if (state.program->gfx_level == GFX9) {
         if (instr->opcode == aco_opcode::s_movrels_b32 ||
             instr->opcode == aco_opcode::s_movrels_b64 ||
             instr->opcode == aco_opcode::s_movreld_b32 ||
             instr->opcode == aco_opcode::s_movreld_b64) {
            NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
         }
      }

      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isDS() && instr->ds().gds) {
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isVALU() || instr->isVINTRP()) {
      if (instr->isDPP()) {
         NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
         handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
      }

      for (Definition def : instr->definitions) {
         if (def.regClass().type() != RegType::sgpr) {
            for (unsigned i = 0; i < def.size(); i++)
               NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
         }
      }

      if ((instr->opcode == aco_opcode::v_readlane_b32 ||
           instr->opcode == aco_opcode::v_readlane_b32_e64 ||
           instr->opcode == aco_opcode::v_writelane_b32 ||
           instr->opcode == aco_opcode::v_writelane_b32_e64) &&
          !instr->operands[1].isConstant()) {
         handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
      }

      /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
       * is followed by a read with v_readfirstlane or v_readlane to fix GPU
       * hangs on GFX6. Note that v_writelane_* is apparently not affected.
       * This hazard isn't documented anywhere, but AMD confirmed it.
       */
      if (state.program->gfx_level == GFX6 &&
          (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
           instr->opcode == aco_opcode::v_readfirstlane_b32)) {
         handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
      }

      if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
          instr->opcode == aco_opcode::v_div_fmas_f64)
         NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* If a VALU instruction writes an SGPR that is then used by a VMEM
       * instruction, five wait states are required. */
      for (Operand op : instr->operands) {
         if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
            handle_valu_then_read_hazard(state, &NOPs, 5, op);
      }
   }

   if (!instr->isSALU() && instr->format != Format::SMEM)
      NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9) {
      bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
      if (instr->isVINTRP() || lds_scratch_global ||
          instr->opcode == aco_opcode::ds_read_addtid_b32 ||
          instr->opcode == aco_opcode::ds_write_addtid_b32 ||
          instr->opcode == aco_opcode::buffer_store_lds_dword) {
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
      }
   }

   ctx.add_wait_states(NOPs + get_wait_states(instr));

   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
   if (NOPs) {
      /* create NOP */
      aco_ptr<Instruction> nop{create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)};
      nop->salu().imm = NOPs - 1;
      new_instructions.emplace_back(std::move(nop));
   }

   /* update information to check for later hazards */
   if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
      ctx.smem_clause = false;
      ctx.smem_write = false;

      if (state.program->dev.xnack_enabled) {
         BITSET_ZERO(ctx.smem_clause_read_write);
         BITSET_ZERO(ctx.smem_clause_write);
      }
   }

   if (instr->isSMEM()) {
      if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
         ctx.smem_write = true;
      } else {
         ctx.smem_clause = true;

         if (state.program->dev.xnack_enabled) {
            for (Operand op : instr->operands) {
               if (!op.isConstant()) {
                  set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
               }
            }

            Definition def = instr->definitions[0];
            set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
            set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
         }
      }
   } else if (instr->isVALU()) {
      for (Definition def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            if (def.physReg() == vcc || def.physReg() == vcc_hi) {
               ctx.valu_wr_vcc_then_div_fmas = 4;
            }
            if (def.physReg() == exec || def.physReg() == exec_hi) {
               ctx.valu_wr_exec_then_dpp = 5;
            }
         }
      }
   } else if (instr->isSALU()) {
      if (!instr->definitions.empty()) {
         /* all other definitions should be SCC */
         Definition def = instr->definitions[0];
         if (def.physReg() == m0) {
            ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
            ctx.salu_wr_m0_then_lds = 1;
            ctx.salu_wr_m0_then_moverel = 1;
         }
      } else if (instr->opcode == aco_opcode::s_setreg_b32 ||
                 instr->opcode == aco_opcode::s_setreg_imm32_b32) {
         SALU_instruction& sopk = instr->salu();
         unsigned offset = (sopk.imm >> 6) & 0x1f;
         unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
         unsigned reg = sopk.imm & 0x3f;
         ctx.setreg_then_getsetreg = 2;

         if (reg == 1 && offset >= 28 && size > (28 - offset))
            ctx.set_vskip_mode_then_vector = 2;
      }
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
      bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
                          instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
      /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
       * store) */
      bool consider_mimg = instr->isMIMG() &&
                           instr->operands[1].regClass().type() == RegType::vgpr &&
                           instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
      /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
      bool consider_flat =
         instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
      if (consider_buf || consider_mimg || consider_flat) {
         PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
         unsigned size = instr->operands[consider_flat ? 2 : 3].size();
         for (unsigned i = 0; i < size; i++)
            ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
      }
   }
}

bool
is_latest_instr_vintrp(bool& global_state, bool& block_state, aco_ptr<Instruction>& pred)
{
   if (pred->isVINTRP())
      global_state = true;
   return true;
}

template <bool Salu, bool Sgpr>
bool
handle_wr_hazard_instr(int& global_state, int& block_state, aco_ptr<Instruction>& pred)
{
   if (Salu ? pred->isSALU() : (pred->isVALU() || pred->isVINTRP())) {
      for (Definition dst : pred->definitions) {
         if ((dst.physReg().reg() < 256) == Sgpr) {
            global_state = MAX2(global_state, block_state);
            return true;
         }
      }
   }

   block_state -= get_wait_states(pred);
   return block_state <= 0;
}

template <bool Salu, bool Sgpr>
void
handle_wr_hazard(State& state, int* NOPs, int min_states)
{
   if (*NOPs >= min_states)
      return;

   int global = 0;
   int block = min_states;
   search_backwards<int, int, nullptr, handle_wr_hazard_instr<Salu, Sgpr>>(state, global, block);
   *NOPs = MAX2(*NOPs, global);
}
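
/* For instance (illustrative), handle_wr_hazard<false, true> asks: "did any
 * VALU/VINTRP instruction within the last min_states wait states write an
 * SGPR?" The register class is inferred from the definition's physical
 * register (SGPRs live below PhysReg 256, VGPRs at 256 and above). */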

void
resolve_all_gfx6(State& state, NOP_ctx_gfx6& ctx,
                 std::vector<aco_ptr<Instruction>>& new_instructions)
{
   int NOPs = 0;

   /* SGPR->SMEM hazards */
   if (state.program->gfx_level == GFX6) {
      handle_wr_hazard<true, true>(state, &NOPs, 4);
      handle_wr_hazard<false, true>(state, &NOPs, 4);
   }

   /* Break up SMEM clauses */
   if (ctx.smem_clause || ctx.smem_write)
      NOPs = MAX2(NOPs, 1);

   /* SALU/GDS hazards */
   NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
   NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);

   /* VALU hazards */
   NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
   if (state.program->gfx_level >= GFX8)
      handle_wr_hazard<false, false>(state, &NOPs, 2); /* VALU->DPP */
   NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data.any() ? 1 : 0);
   if (state.program->gfx_level == GFX6) {
      /* VINTRP->v_readlane_b32/etc */
      bool vintrp = false;
      search_backwards<bool, bool, nullptr, is_latest_instr_vintrp>(state, vintrp, vintrp);
      if (vintrp)
         NOPs = MAX2(NOPs, 1);
   }
   NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);

   /* VALU(sgpr)->VMEM/v_readlane_b32/etc hazards. v_readlane_b32/etc require only 4 NOPs. */
   handle_wr_hazard<false, true>(state, &NOPs, 5);

   NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);

   ctx.add_wait_states(NOPs);
   if (NOPs) {
      Builder bld(state.program, &new_instructions);
      bld.sopp(aco_opcode::s_nop, NOPs - 1);
   }
}
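
/* resolve_all_gfx6 conservatively assumes every tracked hazard is live and
 * emits the worst-case NOP count, presumably for points where the following
 * code cannot be analyzed; the GFX10 pass below has an analogous resolve_all
 * variant. Note that a single "s_nop (NOPs - 1)" provides NOPs wait states,
 * matching get_wait_states(). */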

template <std::size_t N>
bool
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [&check_regs](const Definition& def) -> bool
                      {
                         bool writes_any = false;
                         for (unsigned i = 0; i < def.size(); i++) {
                            unsigned def_reg = def.physReg() + i;
                            writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
                         }
                         return writes_any;
                      });
}

template <std::size_t N>
bool
check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->operands.begin(), instr->operands.end(),
                      [&check_regs](const Operand& op) -> bool
                      {
                         if (op.isConstant())
                            return false;
                         bool reads_any = false;
                         for (unsigned i = 0; i < op.size(); i++) {
                            unsigned op_reg = op.physReg() + i;
                            reads_any |= op_reg < check_regs.size() && check_regs[op_reg];
                         }
                         return reads_any;
                      });
}

template <std::size_t N>
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   for (const Operand& op : instr->operands) {
      for (unsigned i = 0; i < op.size(); i++) {
         unsigned reg = op.physReg() + i;
         if (reg < reg_reads.size())
            reg_reads.set(reg);
      }
   }
}

template <std::size_t N>
void
mark_read_regs_exec(State& state, const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   mark_read_regs(instr, reg_reads);
   reg_reads.set(exec);
   if (state.program->wave_size == 64)
      reg_reads.set(exec_hi);
}

bool
VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
   if (instr->isVOPC())
      return true;
   if (instr->isVOP3() && instr->definitions.size() == 2)
      return true;
   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32_e64)
      return true;
   return false;
}
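
/* Examples of the cases VALU_writes_sgpr covers (illustrative): VOPC compares
 * like v_cmp_eq_u32 write a lane mask to VCC/SGPRs, VOP3 instructions with a
 * second definition (e.g. v_add_co_u32) write a carry-out lane mask, and
 * v_readfirstlane_b32/v_readlane_b32 move a lane's value into an SGPR. */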

bool
instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [](const Definition& def) -> bool
                      { return def.getTemp().type() == RegType::sgpr; });
}

inline bool
instr_is_branch(const aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz ||
          instr->opcode == aco_opcode::s_cbranch_execz ||
          instr->opcode == aco_opcode::s_cbranch_execnz ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
          instr->opcode == aco_opcode::s_cbranch_cdbguser ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
          instr->opcode == aco_opcode::s_subvector_loop_begin ||
          instr->opcode == aco_opcode::s_subvector_loop_end ||
          instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
          instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
}

void
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
{
   // TODO: s_dcache_inv needs to be in its own group on GFX10

   Builder bld(state.program, &new_instructions);

   unsigned vm_vsrc = 7;
   unsigned sa_sdst = 1;
   if (debug_flags & DEBUG_FORCE_WAITDEPS) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
      vm_vsrc = 0;
      sa_sdst = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      vm_vsrc = (instr->salu().imm >> 2) & 0x7;
      sa_sdst = instr->salu().imm & 0x1;
   }

   /* VMEMtoScalarWriteHazard
    * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
    * in-between.
    */
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
      /* Remember all SGPRs that are read by the VMEM/DS instruction */
      if (instr->isVMEM() || instr->isFlatLike())
         mark_read_regs_exec(
            state, instr,
            instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM);
      if (instr->isFlat() || instr->isDS())
         mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
   } else if (instr->isSALU() || instr->isSMEM()) {
      wait_imm imm;
      if (imm.unpack(state.program->gfx_level, instr.get())) {
         if (imm.vm == 0)
            ctx.sgprs_read_by_VMEM.reset();
         if (imm.lgkm == 0)
            ctx.sgprs_read_by_DS.reset();
         if (imm.vs == 0)
            ctx.sgprs_read_by_VMEM_store.reset();
      } else if (vm_vsrc == 0) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();
      }

      /* Check if the SALU writes an SGPR that was previously read by a VMEM/DS instruction */
      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) ||
          check_written_regs(instr, ctx.sgprs_read_by_DS) ||
          check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
      }
   } else if (instr->isVALU()) {
      /* Hazard is mitigated by any VALU instruction */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
    */
   if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
      /* we only need to check definitions[0] because since GFX10 v_cmpx only writes one dest */
      ctx.has_VOPC_write_exec = true;
   } else if (ctx.has_VOPC_write_exec && (instr->opcode == aco_opcode::v_permlane16_b32 ||
                                          instr->opcode == aco_opcode::v_permlanex16_b32)) {
      ctx.has_VOPC_write_exec = false;

      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
      bld.vop1(aco_opcode::v_mov_b32, Definition(instr->operands[0].physReg(), v1),
               Operand(instr->operands[0].physReg(), v1));
   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
      ctx.has_VOPC_write_exec = false;
   }

   /* VcmpxExecWARHazard
    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
    */
   if (!instr->isVALU() && instr->reads_exec()) {
      ctx.has_nonVALU_exec_read = true;
   } else if (instr->isVALU() && ctx.has_nonVALU_exec_read) {
      if (instr->writes_exec()) {
         ctx.has_nonVALU_exec_read = false;

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
      } else if (instr_writes_sgpr(instr)) {
         /* Any VALU instruction that writes an SGPR mitigates the problem */
         ctx.has_nonVALU_exec_read = false;
      }
   } else if (sa_sdst == 0) {
      ctx.has_nonVALU_exec_read = false;
   }

   /* SMEMtoVectorWriteHazard
    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
    */
   if (instr->isSMEM()) {
      /* Remember all SGPRs that are read by the SMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
   } else if (VALU_writes_sgpr(instr)) {
      /* Check if the VALU writes an SGPR that was previously read by SMEM */
      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
         ctx.sgprs_read_by_SMEM.reset();

         /* Insert s_mov to mitigate the problem */
         bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
      }
   } else if (instr->isSALU()) {
      wait_imm imm;
      if (imm.unpack(state.program->gfx_level, instr.get()) && imm.lgkm == 0) {
         /* Reducing the lgkmcnt count to 0 always mitigates the hazard. */
         ctx.sgprs_read_by_SMEM.reset();
      } else if (instr->format != Format::SOPP && instr->definitions.size()) {
         /* SALU can mitigate the hazard */
         ctx.sgprs_read_by_SMEM.reset();
      }
   }

   /* LdsBranchVmemWARHazard
    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
    */
   if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) {
      if (ctx.has_branch_after_DS)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_DS = false;
      ctx.has_VMEM = true;
   } else if (instr->isDS()) {
      if (ctx.has_branch_after_VMEM)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_VMEM = false;
      ctx.has_DS = true;
   } else if (instr_is_branch(instr)) {
      ctx.has_branch_after_VMEM |= ctx.has_VMEM;
      ctx.has_branch_after_DS |= ctx.has_DS;
      ctx.has_VMEM = ctx.has_DS = false;
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      /* Only s_waitcnt_vscnt can mitigate the hazard */
      const SALU_instruction& sopk = instr->salu();
      if (sopk.operands[0].physReg() == sgpr_null && sopk.imm == 0)
         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug
    * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
    * 0).
    */
   if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
      ctx.has_NSA_MIMG = true;
   } else if (ctx.has_NSA_MIMG) {
      ctx.has_NSA_MIMG = false;

      if (instr->isMUBUF() || instr->isMTBUF()) {
         uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
         if (offset & 6)
            bld.sopp(aco_opcode::s_nop, 0);
      }
   }

   /* waNsaCannotFollowWritelane
    * Handles NSA MIMG immediately following a v_writelane_b32.
    */
   if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
      ctx.has_writelane = true;
   } else if (ctx.has_writelane) {
      ctx.has_writelane = false;
      if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
         bld.sopp(aco_opcode::s_nop, 0);
   }
}

void
resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

   size_t prev_count = new_instructions.size();

   /* VcmpxPermlaneHazard */
   if (ctx.has_VOPC_write_exec) {
      ctx.has_VOPC_write_exec = false;
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));

      /* VALU mitigates VMEMtoScalarWriteHazard. */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   unsigned waitcnt_depctr = 0xffff;

   /* VMEMtoScalarWriteHazard */
   if (ctx.sgprs_read_by_VMEM.any() || ctx.sgprs_read_by_DS.any() ||
       ctx.sgprs_read_by_VMEM_store.any()) {
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
      waitcnt_depctr &= 0xffe3;
   }

   /* VcmpxExecWARHazard */
   if (ctx.has_nonVALU_exec_read) {
      ctx.has_nonVALU_exec_read = false;
      waitcnt_depctr &= 0xfffe;
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);

   /* SMEMtoVectorWriteHazard */
   if (ctx.sgprs_read_by_SMEM.any()) {
      ctx.sgprs_read_by_SMEM.reset();
      bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
   }

   /* LdsBranchVmemWARHazard */
   if (ctx.has_VMEM || ctx.has_branch_after_VMEM || ctx.has_DS || ctx.has_branch_after_DS) {
      bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug/waNsaCannotFollowWritelane */
   if (ctx.has_NSA_MIMG || ctx.has_writelane) {
      ctx.has_NSA_MIMG = ctx.has_writelane = false;
      /* Any instruction resolves these hazards. */
      if (new_instructions.size() == prev_count)
         bld.sopp(aco_opcode::s_nop, 0);
   }
}

void
fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
{
   if (reg.reg() < 256)
      return;
   for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
      set.set(reg.reg() - 256 + i);
}

bool
test_vgpr_bitset(std::bitset<256>& set, Operand op)
{
   if (op.physReg().reg() < 256)
      return false;
   for (unsigned i = 0; i < op.size(); i++) {
      if (set[op.physReg().reg() - 256 + i])
         return true;
   }
   return false;
}
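
/* Physical register encoding note (from the PhysReg usage above): SGPRs occupy
 * PhysReg numbers 0-255 and VGPRs start at 256, so VGPR v0 is PhysReg(256).
 * For example (illustrative), fill_vgpr_bitset(set, PhysReg(258), 8) marks
 * bits 2 and 3, i.e. the two dwords held in v2-v3. */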

/* GFX11 */
struct LdsDirectVALUHazardGlobalState {
   unsigned wait_vdst = 15;
   PhysReg vgpr;
   std::set<unsigned> loop_headers_visited;
};

struct LdsDirectVALUHazardBlockState {
   unsigned num_valu = 0;
   bool has_trans = false;

   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};

bool
handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state,
                                    aco_ptr<Instruction>& instr)
{
   if (instr->isVALU()) {
      block_state.has_trans |= instr->isTrans();

      bool uses_vgpr = false;
      for (Definition& def : instr->definitions)
         uses_vgpr |= regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1);
      for (Operand& op : instr->operands) {
         uses_vgpr |=
            !op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1);
      }
      if (uses_vgpr) {
         /* Transcendentals execute in parallel to other VALU and the va_vdst count becomes unusable */
         global_state.wait_vdst =
            MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
         return true;
      }

      block_state.num_valu++;
   }

   if (parse_depctr_wait(instr.get()).va_vdst == 0)
      return true;

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set wait_vdst to be safe. */
      global_state.wait_vdst =
         MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
      return true;
   }

   return block_state.num_valu >= global_state.wait_vdst;
}

bool
handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state, Block* block)
{
   if (block->kind & block_kind_loop_header) {
      if (global_state.loop_headers_visited.count(block->index))
         return false;
      global_state.loop_headers_visited.insert(block->index);
   }

   block_state.num_blocks++;

   return true;
}

unsigned
handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* LdsDirectVALUHazard
    * Handle LDSDIR writing a VGPR after it's used by a VALU instruction.
    */
   if (instr->ldsdir().wait_vdst == 0)
      return 0; /* early exit */

   LdsDirectVALUHazardGlobalState global_state;
   global_state.wait_vdst = instr->ldsdir().wait_vdst;
   global_state.vgpr = instr->definitions[0].physReg();
   LdsDirectVALUHazardBlockState block_state;
   search_backwards<LdsDirectVALUHazardGlobalState, LdsDirectVALUHazardBlockState,
                    &handle_lds_direct_valu_hazard_block, &handle_lds_direct_valu_hazard_instr>(
      state, global_state, block_state);
   return global_state.wait_vdst;
}
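
/* Example of the wait_vdst computation above (illustrative): if the LDSDIR
 * destination VGPR was last touched by a VALU instruction with two other,
 * non-transcendental VALU instructions in between, the search returns
 * wait_vdst = 2, meaning "wait until all but the last 2 VALU results are
 * done". If a transcendental was seen on the path, only va_vdst = 0 is safe. */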
1228 
1229 enum VALUPartialForwardingHazardState : uint8_t {
1230    nothing_written,
1231    written_after_exec_write,
1232    exec_written,
1233 };
1234 
1235 struct VALUPartialForwardingHazardGlobalState {
1236    bool hazard_found = false;
1237    std::set<unsigned> loop_headers_visited;
1238 };
1239 
1240 struct VALUPartialForwardingHazardBlockState {
1241    /* initialized by number of VGPRs read by VALU, decrement when encountered to return early */
1242    uint8_t num_vgprs_read = 0;
1243    BITSET_DECLARE(vgprs_read, 256) = {0};
1244    enum VALUPartialForwardingHazardState state = nothing_written;
1245    unsigned num_valu_since_read = 0;
1246    unsigned num_valu_since_write = 0;
1247 
1248    unsigned num_instrs = 0;
1249    unsigned num_blocks = 0;
1250 };
1251 
bool
handle_valu_partial_forwarding_hazard_instr(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            aco_ptr<Instruction>& instr)
{
   /* Check if there is already a hazard found on some other control flow path. */
   if (global_state.hazard_found)
      return true;

   if (instr->isSALU() && !instr->definitions.empty()) {
      if (block_state.state == written_after_exec_write && instr->writes_exec())
         block_state.state = exec_written;
   } else if (instr->isVALU()) {
      bool vgpr_write = false;
      for (Definition& def : instr->definitions) {
         if (def.physReg().reg() < 256)
            continue;

         for (unsigned i = 0; i < def.size(); i++) {
            unsigned reg = def.physReg().reg() - 256 + i;
            if (!BITSET_TEST(block_state.vgprs_read, reg))
               continue;

            if (block_state.state == exec_written && block_state.num_valu_since_write < 3) {
               global_state.hazard_found = true;
               return true;
            }

            BITSET_CLEAR(block_state.vgprs_read, reg);
            block_state.num_vgprs_read--;
            vgpr_write = true;
         }
      }

      if (vgpr_write) {
         /* If the state is nothing_written: the check below should ensure that this write is
          * close enough to the read.
          *
          * If the state is exec_written: the current choice of second write has failed. Reset and
          * try with the current write as the second one, if it's close enough to the read.
          *
          * If the state is written_after_exec_write: a further second write would be better, if
          * it's close enough to the read.
          */
         if (block_state.state == nothing_written || block_state.num_valu_since_read < 5) {
            block_state.state = written_after_exec_write;
            block_state.num_valu_since_write = 0;
         } else {
            block_state.num_valu_since_write++;
         }
      } else {
         block_state.num_valu_since_write++;
      }

      block_state.num_valu_since_read++;
   } else if (parse_depctr_wait(instr.get()).va_vdst == 0) {
      return true;
   }

   if (block_state.num_valu_since_read >= (block_state.state == nothing_written ? 5 : 8))
      return true; /* Hazard not possible at this distance. */
   if (block_state.num_vgprs_read == 0)
      return true; /* All VGPRs have been written and a hazard was never found. */

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set hazard_found=true to be safe. */
      global_state.hazard_found = true;
      return true;
   }

   return false;
}

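/* Per-block callback for the backwards search: stops the walk at a loop header
 * that was already visited and counts visited blocks so that the search can be
 * cut off.
 */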
bool
handle_valu_partial_forwarding_hazard_block(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            Block* block)
{
   if (block->kind & block_kind_loop_header) {
      if (global_state.loop_headers_visited.count(block->index))
         return false;
      global_state.loop_headers_visited.insert(block->index);
   }

   block_state.num_blocks++;

   return true;
}

bool
handle_valu_partial_forwarding_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* VALUPartialForwardingHazard
    * A VALU instruction reads two VGPRs: one written before an exec write by SALU and one after.
    * For the hazard, there must be fewer than 3 VALU instructions between the first and second
    * VGPR writes, and fewer than 5 VALU instructions between the second VGPR write and the
    * current instruction.
    */
   if (state.program->wave_size != 64 || !instr->isVALU())
      return false;

   unsigned num_vgprs = 0;
   for (Operand& op : instr->operands)
      num_vgprs += op.physReg().reg() < 256 ? 0 : op.size();
   if (num_vgprs <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardBlockState block_state;

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Operand& op = instr->operands[i];
      if (op.physReg().reg() < 256)
         continue;
      for (unsigned j = 0; j < op.size(); j++)
         BITSET_SET(block_state.vgprs_read, op.physReg().reg() - 256 + j);
   }
   block_state.num_vgprs_read = BITSET_COUNT(block_state.vgprs_read);

   if (block_state.num_vgprs_read <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardGlobalState global_state;
   search_backwards<VALUPartialForwardingHazardGlobalState, VALUPartialForwardingHazardBlockState,
                    &handle_valu_partial_forwarding_hazard_block,
                    &handle_valu_partial_forwarding_hazard_instr>(state, global_state, block_state);
   return global_state.hazard_found;
}

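/* Main GFX11+ per-instruction handler: updates the hazard-tracking context and
 * emits NOPs or waits into new_instructions ahead of instr where needed.
 */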
void
handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

   /* Due to a hazard, an s_nop is needed before "s_sendmsg sendmsg_dealloc_vgprs". */
   if (instr->opcode == aco_opcode::s_sendmsg && instr->salu().imm == sendmsg_dealloc_vgprs &&
       (new_instructions.empty() || new_instructions.back()->opcode != aco_opcode::s_nop)) {
      bld.sopp(aco_opcode::s_nop, 0);
   }

   /* VcmpxPermlaneHazard
    * Handle a permlane following a VOPC instruction that writes exec: a v_nop must be inserted
    * in between.
    */
   if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
      ctx.has_Vcmpx = true;
   } else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 ||
                                instr->opcode == aco_opcode::v_permlanex16_b32 ||
                                instr->opcode == aco_opcode::v_permlane64_b32 ||
                                instr->opcode == aco_opcode::v_permlane16_var_b32 ||
                                instr->opcode == aco_opcode::v_permlanex16_var_b32)) {
      ctx.has_Vcmpx = false;

      /* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */
      bld.vop1(aco_opcode::v_nop);
   } else if (instr->isVALU()) {
      ctx.has_Vcmpx = false;
   }

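   /* Determine the effective dependency-counter waits of this instruction:
    * va_vdst (outstanding VALU VGPR writes), vm_vsrc (outstanding VMEM/LDS
    * VGPR reads) and sa_sdst (outstanding SALU SGPR writes). Judging by the
    * masks used below, these occupy bits [15:12], bits [4:2] and bit 0 of the
    * s_waitcnt_depctr immediate.
    */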
   depctr_wait wait = parse_depctr_wait(instr.get());
   unsigned va_vdst = wait.va_vdst;
   unsigned vm_vsrc = 7;
   unsigned sa_sdst = 1;

   if (debug_flags & DEBUG_FORCE_WAITDEPS) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
      va_vdst = 0;
      vm_vsrc = 0;
      sa_sdst = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      /* va_vdst is already obtained through parse_depctr_wait(). */
      vm_vsrc = (instr->salu().imm >> 2) & 0x7;
      sa_sdst = instr->salu().imm & 0x1;
   } else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) {
      vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0;
   }

   if (instr->isLDSDIR()) {
      unsigned count = handle_lds_direct_valu_hazard(state, instr);
      LDSDIR_instruction* ldsdir = &instr->ldsdir();
      if (count < va_vdst) {
         ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
         va_vdst = MIN2(va_vdst, count);
      }
   }

   /* VALUTransUseHazard
    * A VALU instruction reads a VGPR written by a transcendental instruction without 6+ VALU or
    * 2+ transcendental instructions in between.
    */
   if (state.program->gfx_level < GFX11_5 && va_vdst > 0 && instr->isVALU()) {
      uint8_t num_valu = 15;
      uint8_t num_trans = 15;
      for (Operand& op : instr->operands) {
         if (op.physReg().reg() < 256)
            continue;
         for (unsigned i = 0; i < op.size(); i++) {
            PhysReg reg = op.physReg().advance(i * 4);
            num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(reg));
            num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(reg));
         }
      }
      if (num_trans <= 1 && num_valu <= 5) {
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
         va_vdst = 0;
      }
   }

   if (va_vdst > 0 && state.program->gfx_level < GFX12 &&
       handle_valu_partial_forwarding_hazard(state, instr)) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
      va_vdst = 0;
   }

   if (state.program->gfx_level < GFX12) {
      /* VALUMaskWriteHazard
       * An SGPR that was read by a VALU as a lane mask and then written by an SALU cannot safely
       * be read by an SALU or VALU.
       */
      if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU()) &&
          check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
         sa_sdst = 0;
      }

      if (va_vdst == 0) {
         ctx.valu_since_wr_by_trans.reset();
         ctx.trans_since_wr_by_trans.reset();
      }

      if (sa_sdst == 0)
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();

      if (state.program->wave_size == 64 && instr->isSALU() &&
          check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
         unsigned reg = instr->definitions[0].physReg().reg();
         for (unsigned i = 0; i < instr->definitions[0].size(); i++)
            ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu[reg + i] = 1;
      }

      if (instr->isVALU()) {
         bool is_trans = instr->isTrans();

         ctx.valu_since_wr_by_trans.inc();
         if (is_trans)
            ctx.trans_since_wr_by_trans.inc();

         if (is_trans) {
            for (Definition& def : instr->definitions) {
               for (unsigned i = 0; i < def.size(); i++) {
                  PhysReg reg = def.physReg().advance(i * 4);
                  ctx.valu_since_wr_by_trans.set(reg);
                  ctx.trans_since_wr_by_trans.set(reg);
               }
            }
         }

         if (state.program->wave_size == 64) {
            for (Operand& op : instr->operands) {
               /* This should ignore exec reads. */
               if (!op.isConstant() && op.physReg().reg() < 126)
                  ctx.sgpr_read_by_valu_as_lanemask.reset();
            }
            switch (instr->opcode) {
            case aco_opcode::v_addc_co_u32:
            case aco_opcode::v_subb_co_u32:
            case aco_opcode::v_subbrev_co_u32:
            case aco_opcode::v_cndmask_b16:
            case aco_opcode::v_cndmask_b32:
            case aco_opcode::v_div_fmas_f32:
            case aco_opcode::v_div_fmas_f64:
               if (instr->operands.back().physReg() != exec) {
                  ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
                  ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
               }
               break;
            default: break;
            }
         }
      }
   } else {
      /* VALUReadSGPRHazard
       * An SGPR that was read by a VALU and then written by an SALU cannot safely be read by a
       * VALU or SALU.
       */
      if (instr->isVALU() || instr->isSALU()) {
         unsigned expiry_count = instr->isSALU() ? 10 : 11;
         uint16_t imm = 0xffff;

         for (Operand& op : instr->operands) {
            if (op.physReg() >= m0)
               continue;

            for (unsigned i = 0; i < op.size(); i++) {
               PhysReg reg = op.physReg().advance(i * 4);
               if (ctx.sgpr_read_by_valu_then_wr_by_salu.get(reg) < expiry_count) {
                  imm &= 0xfffe;
                  sa_sdst = 0;
               }
               if (instr->isVALU()) {
                  ctx.sgpr_read_by_valu.set(reg / 2);

                  /* s_wait_alu on va_sdst (if non-VCC SGPR) or va_vcc (if VCC SGPR) */
                  if (ctx.sgpr_read_by_valu_then_wr_by_valu[reg]) {
                     bool is_vcc = reg == vcc || reg == vcc_hi;
                     imm &= is_vcc ? 0xfffd : 0xf1ff;
                     if (is_vcc)
                        wait.va_vcc = 0;
                     else
                        wait.va_sdst = 0;
                  }
               }
            }
         }

         if (imm != 0xffff)
            bld.sopp(aco_opcode::s_waitcnt_depctr, imm);
      }

      if (sa_sdst == 0)
         ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
      else if (instr->isSALU() && !instr->isSOPP())
         ctx.sgpr_read_by_valu_then_wr_by_salu.inc();

      if (wait.va_sdst == 0) {
         std::bitset<m0.reg()> old = ctx.sgpr_read_by_valu_then_wr_by_valu;
         ctx.sgpr_read_by_valu_then_wr_by_valu.reset();
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = old[vcc];
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = old[vcc_hi];
      }
      if (wait.va_vcc == 0) {
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false;
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false;
      }

      if (instr->isVALU() && !instr->definitions.empty()) {
         PhysReg reg = instr->definitions[0].physReg();
         if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) {
            for (unsigned i = 0; i < instr->definitions[0].size(); i++)
               ctx.sgpr_read_by_valu_then_wr_by_valu.set(reg + i);
         }
      } else if (instr->isSALU() && !instr->definitions.empty()) {
         PhysReg reg = instr->definitions[0].physReg();
         if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) {
            for (unsigned i = 0; i < instr->definitions[0].size(); i++)
               ctx.sgpr_read_by_valu_then_wr_by_salu.set(reg.advance(i * 4));
         }
      }
   }

   /* LdsDirectVMEMHazard
    * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
    */
   if (instr->isVMEM() || instr->isFlatLike()) {
      if (instr->definitions.empty()) {
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
      } else {
         uint8_t vmem_type = state.program->gfx_level >= GFX12
                                ? get_vmem_type(state.program->gfx_level, instr.get())
                                : vmem_nosampler;
         std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
         if (vmem_type == vmem_sampler)
            vgprs = &ctx.vgpr_used_by_vmem_sample;
         else if (vmem_type == vmem_bvh)
            vgprs = &ctx.vgpr_used_by_vmem_bvh;

         for (Definition& def : instr->definitions)
            fill_vgpr_bitset(*vgprs, def.physReg(), def.bytes());
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(*vgprs, op.physReg(), op.bytes());
      }
   }
   if (instr->isDS() || instr->isFlat()) {
      for (Definition& def : instr->definitions)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes());
      for (Operand& op : instr->operands)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
   }
   wait_imm imm;
   if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_sample.reset();
      ctx.vgpr_used_by_vmem_bvh.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   } else if (imm.unpack(state.program->gfx_level, instr.get())) {
      if (imm.vm == 0)
         ctx.vgpr_used_by_vmem_load.reset();
      if (imm.sample == 0)
         ctx.vgpr_used_by_vmem_sample.reset();
      if (imm.bvh == 0)
         ctx.vgpr_used_by_vmem_bvh.reset();
      if (imm.lgkm == 0)
         ctx.vgpr_used_by_ds.reset();
      if (imm.vs == 0)
         ctx.vgpr_used_by_vmem_store.reset();
   }
   if (instr->isLDSDIR()) {
      unsigned vgpr = instr->definitions[0].physReg().reg() - 256;
      if (ctx.vgpr_used_by_vmem_load[vgpr] || ctx.vgpr_used_by_vmem_sample[vgpr] ||
          ctx.vgpr_used_by_vmem_bvh[vgpr] || ctx.vgpr_used_by_vmem_store[vgpr] ||
          ctx.vgpr_used_by_ds[vgpr]) {
         if (state.program->gfx_level >= GFX12)
            instr->ldsdir().wait_vsrc = 0;
         else
            bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
         ctx.vgpr_used_by_vmem_load.reset();
         ctx.vgpr_used_by_vmem_sample.reset();
         ctx.vgpr_used_by_vmem_bvh.reset();
         ctx.vgpr_used_by_vmem_store.reset();
         ctx.vgpr_used_by_ds.reset();
      }
   }

   /* WMMA Hazards */
   if (instr_info.classes[(int)instr->opcode] == instr_class::wmma) {
      assert(instr->operands.back().regClass() == instr->definitions[0].regClass());

      bool is_swmma = instr->operands.size() == 4;
      if (test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[0]) ||
          test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[1]) ||
          (is_swmma && test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[2]))) {
         bld.vop1(aco_opcode::v_nop);
      }

      ctx.vgpr_written_by_wmma.reset();
      fill_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->definitions[0].physReg(),
                       instr->definitions[0].bytes());
   } else if (instr->isVALU()) {
      ctx.vgpr_written_by_wmma.reset();
   }
}

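/* Callback for resolve_all_gfx11(): searches backwards for whether va_vdst=0
 * was enforced since the last VALU instruction that accessed a VGPR.
 * block_state holds the remaining search depth.
 */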
bool
has_vdst0_since_valu_instr(bool& global_state, unsigned& block_state, aco_ptr<Instruction>& pred)
{
   if (parse_depctr_wait(pred.get()).va_vdst == 0)
      return true;

   if (--block_state == 0) {
      global_state = false;
      return true;
   }

   if (pred->isVALU()) {
      bool vgpr_rd_or_wr = false;
      for (Definition def : pred->definitions) {
         if (def.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      for (Operand op : pred->operands) {
         if (op.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      if (vgpr_rd_or_wr) {
         global_state = false;
         return true;
      }
   }

   return false;
}

void
resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

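   /* Start from the all-ones "wait for nothing" immediate and clear fields as
    * hazards are found, so that a single s_waitcnt_depctr at the end resolves
    * everything at once.
    */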
   unsigned waitcnt_depctr = 0xffff;
   bool valu_read_sgpr = false;

   /* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
   bool has_vdst0_since_valu = true;
   unsigned depth = 16;
   search_backwards<bool, unsigned, nullptr, has_vdst0_since_valu_instr>(
      state, has_vdst0_since_valu, depth);
   if (!has_vdst0_since_valu) {
      waitcnt_depctr &= 0x0fff;
      ctx.valu_since_wr_by_trans.reset();
      ctx.trans_since_wr_by_trans.reset();
   }

   /* VcmpxPermlaneHazard/WMMAHazards */
   if (ctx.has_Vcmpx || ctx.vgpr_written_by_wmma.any()) {
      ctx.has_Vcmpx = false;
      ctx.vgpr_written_by_wmma.reset();
      bld.vop1(aco_opcode::v_nop);
   }

   /* VALUMaskWriteHazard */
   if (state.program->gfx_level < GFX12 && state.program->wave_size == 64) {
      if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any()) {
         waitcnt_depctr &= 0xfffe;
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
      }
      if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
         valu_read_sgpr = true;
         ctx.sgpr_read_by_valu_as_lanemask.reset();
      }
   }

   /* VALUReadSGPRHazard */
   if (state.program->gfx_level >= GFX12) {
      if (!ctx.sgpr_read_by_valu_then_wr_by_salu.empty())
         waitcnt_depctr &= 0xfffe;

      ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
      if (ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] ||
          ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi]) {
         waitcnt_depctr &= 0xfffd;
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false;
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false;
      }
      if (ctx.sgpr_read_by_valu_then_wr_by_valu.any()) {
         waitcnt_depctr &= 0xf1ff;
         ctx.sgpr_read_by_valu_then_wr_by_valu.reset();
      }
   }

   /* LdsDirectVMEMHazard */
   if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
       ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
       ctx.vgpr_used_by_vmem_bvh.any()) {
      waitcnt_depctr &= 0xffe3;
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_sample.reset();
      ctx.vgpr_used_by_vmem_bvh.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);

   if (valu_read_sgpr) {
      /* This has to be after the s_waitcnt_depctr so that the instruction is not involved in any
       * other hazards. */
      bld.vop3(aco_opcode::v_xor3_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
               Operand(PhysReg(0), s1), Operand(PhysReg(0), s1));

      /* Work around a possible LdsDirectVALUHazard/VALUPartialForwardingHazard. */
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
   }
}

template <typename Ctx>
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
                             std::vector<aco_ptr<Instruction>>&);

template <typename Ctx>
using ResolveAll = void (*)(State& state, Ctx&, std::vector<aco_ptr<Instruction>>&);

template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   State state;
   state.program = program;
   state.block = &block;
   state.old_instructions = std::move(block.instructions);

   block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
   block.instructions.reserve(state.old_instructions.size());

   bool found_end = false;
   for (aco_ptr<Instruction>& instr : state.old_instructions) {
      Handle(state, ctx, instr, block.instructions);

      /* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
      if (instr->opcode == aco_opcode::s_setpc_b64) {
         block.instructions.emplace_back(std::move(instr));

         std::vector<aco_ptr<Instruction>> resolve_instrs;
         Resolve(state, ctx, resolve_instrs);
         block.instructions.insert(std::prev(block.instructions.end()),
                                   std::move_iterator(resolve_instrs.begin()),
                                   std::move_iterator(resolve_instrs.end()));

         found_end = true;
         continue;
      }

      found_end |= instr->opcode == aco_opcode::s_endpgm;
      block.instructions.emplace_back(std::move(instr));
   }

   /* Resolve all possible hazards (we don't know what the shader is concatenated with). */
   if (block.linear_succs.empty() && !found_end)
      Resolve(state, ctx, block.instructions);
}

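/* Walks the blocks in program order, joining the hazard contexts of all linear
 * predecessors before handling each block. Loop bodies get a second pass once
 * the loop exit is reached, so that hazards crossing the back-edge are also
 * detected; that pass stops early if the loop header context did not change.
 */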
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
mitigate_hazards(Program* program, Ctx initial_ctx = Ctx())
{
   std::vector<Ctx> all_ctx(program->blocks.size());
   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& block = program->blocks[i];
      Ctx& ctx = all_ctx[i];

      if (i == 0 || (block.kind & block_kind_resume))
         ctx = initial_ctx;

      if (block.kind & block_kind_loop_header) {
         loop_header_indices.push(i);
      } else if (block.kind & block_kind_loop_exit) {
         /* Go through the whole loop again */
         for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
            Ctx loop_block_ctx;
            for (unsigned b : program->blocks[idx].linear_preds)
               loop_block_ctx.join(all_ctx[b]);

            handle_block<Ctx, Handle, Resolve>(program, loop_block_ctx, program->blocks[idx]);

            /* We only need to continue if the loop header context changed */
            if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
               break;

            all_ctx[idx] = loop_block_ctx;
         }

         loop_header_indices.pop();
      }

      for (unsigned b : block.linear_preds)
         ctx.join(all_ctx[b]);

      handle_block<Ctx, Handle, Resolve>(program, ctx, block);
   }
}

/* FeatureRequiredExportPriority in LLVM */
void
required_export_priority(Program* program)
{
   /* Skip callees, assuming that the caller has already increased the priority. */
   bool increase_priority = !program->is_epilog && !program->info.vs.has_prolog &&
                            (!program->info.merged_shader_compiled_separately ||
                             program->stage.sw == SWStage::VS || program->stage.sw == SWStage::TES);
   increase_priority |= program->is_prolog;

   for (Block& block : program->blocks) {
      std::vector<aco_ptr<Instruction>> new_instructions;
      new_instructions.reserve(block.instructions.size() + 6);

      Builder bld(program, &new_instructions);

      if (increase_priority && block.index == 0) {
         if (!block.instructions.empty() && block.instructions[0]->opcode == aco_opcode::s_setprio)
            block.instructions[0]->salu().imm = MAX2(block.instructions[0]->salu().imm, 2);
         else
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      for (unsigned i = 0; i < block.instructions.size(); i++) {
         Instruction* instr = block.instructions[i].get();
         new_instructions.push_back(std::move(block.instructions[i]));

         if (instr->opcode == aco_opcode::s_setprio) {
            instr->salu().imm = MAX2(instr->salu().imm, 2);
            continue;
         }

         bool end_of_export_sequence = instr->isEXP() && (i == block.instructions.size() - 1 ||
                                                          !block.instructions[i + 1]->isEXP());
         if (!end_of_export_sequence)
            continue;

         bool before_endpgm = false;
         if (i != block.instructions.size() - 1) {
            before_endpgm = block.instructions[i + 1]->opcode == aco_opcode::s_endpgm;
         } else {
            /* Does this fall through to an s_endpgm? */
            for (unsigned j = block.index + 1; j < program->blocks.size(); j++) {
               if (program->blocks[j].instructions.size() == 1 &&
                   program->blocks[j].instructions[0]->opcode == aco_opcode::s_endpgm)
                  before_endpgm = true;
               if (!program->blocks[j].instructions.empty())
                  break;
            }
         }

         bld.sopp(aco_opcode::s_setprio, 0);
         if (!before_endpgm)
            bld.sopk(aco_opcode::s_waitcnt_expcnt, Operand(sgpr_null, s1), 0);
         bld.sopp(aco_opcode::s_nop, 0);
         bld.sopp(aco_opcode::s_nop, 0);
         if (!before_endpgm)
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      block.instructions = std::move(new_instructions);
   }
}

} /* end namespace */

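/* Entry point: inserts NOPs and waits to mitigate hardware hazards, dispatching
 * to the handlers for the program's hardware generation.
 */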
void
insert_NOPs(Program* program)
{
   if (program->gfx_level >= GFX11) {
      NOP_ctx_gfx11 initial_ctx;

      bool has_previous_part =
         program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog ||
         (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS &&
          program->stage.sw != SWStage::TES) || program->stage == raytracing_cs;
      if (program->gfx_level >= GFX12 && has_previous_part) {
         /* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that any
          * SGPR might have been read by VALU if there was a previous shader part.
          */
         initial_ctx.sgpr_read_by_valu.flip();
      }

      mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
                                                                                   initial_ctx);
   } else if (program->gfx_level >= GFX10_3) {
      ; /* no hazards/bugs to mitigate */
   } else if (program->gfx_level >= GFX10) {
      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
   } else {
      mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
   }

   if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
                                         program->stage.hw == AC_HW_PIXEL_SHADER))
      required_export_priority(program);
}

} // namespace aco