/*
 * Copyright © 2019 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "util/bitset.h"

#include <algorithm>
#include <bitset>
#include <set>
#include <stack>
#include <vector>

namespace aco {
namespace {

struct State {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> old_instructions;
};

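/* Hazard context for GFX6-9. Each int8_t field is a remaining-wait-state
 * down-counter: it is set when the hazard-producing instruction is seen and
 * counted down by add_wait_states() as later instructions retire. A sketch of
 * the intended use (illustrative only):
 *
 *    ctx.valu_wr_exec_then_dpp = 5;            // VALU wrote exec
 *    ctx.add_wait_states(get_wait_states(i));  // per following instruction
 *    // a DPP instruction then still needs ctx.valu_wr_exec_then_dpp NOPs
 */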
struct NOP_ctx_gfx6 {
   void join(const NOP_ctx_gfx6& other)
   {
      set_vskip_mode_then_vector =
         MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
      valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
      salu_wr_m0_then_gds_msg_ttrace =
         MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
      valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
      salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
      salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
      setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
      vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
      smem_clause |= other.smem_clause;
      smem_write |= other.smem_write;
      for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
         smem_clause_read_write[i] |= other.smem_clause_read_write[i];
         smem_clause_write[i] |= other.smem_clause_write[i];
      }
   }

   bool operator==(const NOP_ctx_gfx6& other)
   {
      return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
             valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
             vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
             salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
             valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
             salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
             salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
             setreg_then_getsetreg == other.setreg_then_getsetreg &&
             smem_clause == other.smem_clause && smem_write == other.smem_write &&
             BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
             BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
   }

   void add_wait_states(unsigned amount)
   {
      if ((set_vskip_mode_then_vector -= amount) < 0)
         set_vskip_mode_then_vector = 0;

      if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
         valu_wr_vcc_then_div_fmas = 0;

      if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
         salu_wr_m0_then_gds_msg_ttrace = 0;

      if ((valu_wr_exec_then_dpp -= amount) < 0)
         valu_wr_exec_then_dpp = 0;

      if ((salu_wr_m0_then_lds -= amount) < 0)
         salu_wr_m0_then_lds = 0;

      if ((salu_wr_m0_then_moverel -= amount) < 0)
         salu_wr_m0_then_moverel = 0;

      if ((setreg_then_getsetreg -= amount) < 0)
         setreg_then_getsetreg = 0;

      vmem_store_then_wr_data.reset();
   }

   /* setting MODE.vskip and then any vector op requires 2 wait states */
   int8_t set_vskip_mode_then_vector = 0;

   /* VALU writing VCC followed by v_div_fmas requires 4 wait states */
   int8_t valu_wr_vcc_then_div_fmas = 0;

   /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
   int8_t salu_wr_m0_then_gds_msg_ttrace = 0;

   /* VALU writing EXEC followed by DPP requires 5 wait states */
   int8_t valu_wr_exec_then_dpp = 0;

   /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
   int8_t salu_wr_m0_then_lds = 0;

   /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
   int8_t salu_wr_m0_then_moverel = 0;

   /* s_setreg followed by an s_getreg/s_setreg of the same register needs 2 wait states;
    * currently we don't look at the actual register */
   int8_t setreg_then_getsetreg = 0;

   /* some memory instructions writing >64bit followed by an instruction
    * writing the VGPRs holding the write data require 1 wait state */
   std::bitset<256> vmem_store_then_wr_data;

   /* we break up SMEM clauses that contain stores or overwrite an
    * operand/definition of another instruction in the clause */
   bool smem_clause = false;
   bool smem_write = false;
   BITSET_DECLARE(smem_clause_read_write, 128) = {0};
   BITSET_DECLARE(smem_clause_write, 128) = {0};
};

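/* Hazard context for GFX10 (RDNA1). Unlike the GFX6 wait-state counters above,
 * these are sticky flags and SGPR bitsets that remain set until a mitigating
 * instruction (VALU, s_waitcnt_depctr, s_waitcnt_vscnt, ...) is seen. */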
struct NOP_ctx_gfx10 {
   bool has_VOPC_write_exec = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   bool has_NSA_MIMG = false;
   bool has_writelane = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_VMEM_store;
   std::bitset<128> sgprs_read_by_DS;
   std::bitset<128> sgprs_read_by_SMEM;

   void join(const NOP_ctx_gfx10& other)
   {
      has_VOPC_write_exec |= other.has_VOPC_write_exec;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      has_NSA_MIMG |= other.has_NSA_MIMG;
      has_writelane |= other.has_writelane;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_DS |= other.sgprs_read_by_DS;
      sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   bool operator==(const NOP_ctx_gfx10& other)
   {
      return has_VOPC_write_exec == other.has_VOPC_write_exec &&
             has_nonVALU_exec_read == other.has_nonVALU_exec_read && has_VMEM == other.has_VMEM &&
             has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS &&
             has_branch_after_DS == other.has_branch_after_DS &&
             has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_DS == other.sgprs_read_by_DS &&
             sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};

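/* Tracks, per physical register, how many counted events (see inc()) have
 * elapsed since the register was last set(), saturating at Max. A sketch of
 * the intended use, assuming a hazard that expires after Max instructions:
 *
 *    RegCounterMap<6> since_wr;
 *    since_wr.set(def.physReg());          // producer writes the register
 *    since_wr.inc();                       // once per following instruction
 *    if (since_wr.get(op.physReg()) < 6) { // consumer still hazardous
 *       ... insert mitigation ...
 *    }
 */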
template <int Max> struct RegCounterMap {
   void inc() { base++; }
   void set(PhysReg reg) { update(reg, 0); }

   uint8_t get(PhysReg reg)
   {
      if (present.test(reg.reg() & 0x7F)) {
         for (entry& e : list) {
            if (e.reg == reg.reg())
               return MIN2(base - e.val, Max);
         }
      }
      return Max;
   }

   void reset()
   {
      present.reset();
      list.clear();
      base = 0;
   }

   bool empty()
   {
      for (entry& e : list) {
         if (base - e.val < Max)
            return false;
      }
      return true;
   }

   void join_min(const RegCounterMap& other)
   {
      for (const entry& e : other.list) {
         int idx = other.base - e.val;
         if (idx >= Max)
            continue;

         update(e.reg, idx);
      }
   }

   void update(uint16_t reg, int idx)
   {
      int16_t val = base - idx;
      for (entry& e : list) {
         if (e.reg == reg) {
            e.val = MAX2(e.val, val);
            return;
         }
      }
      list.push_back(entry{reg, val});
      present.set(reg & 0x7F);
   }

   bool operator==(const RegCounterMap& other) const
   {
      /* Two maps with different bases could also be equal, but for our use case,
       * i.e. checking for changes at loop headers, this is sufficient since we
       * always join the predecessors into an empty map with base=0.
       */
      return base == other.base && list == other.list;
   }

private:
   struct entry {
      uint16_t reg;
      int16_t val;
      bool operator!=(const entry& other) const { return reg != other.reg || val != other.val; }
   };

   std::bitset<128> present;
   small_vec<entry, 4> list;
   int base = 0;
};

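/* Hazard context for GFX11+. The bitsets are indexed by VGPR/SGPR number and
 * the RegCounterMaps track instruction distances for the hazards named in the
 * comments below. */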
struct NOP_ctx_gfx11 {
   /* VcmpxPermlaneHazard */
   bool has_Vcmpx = false;

   /* LdsDirectVMEMHazard */
   std::bitset<256> vgpr_used_by_vmem_load;
   std::bitset<256> vgpr_used_by_vmem_sample;
   std::bitset<256> vgpr_used_by_vmem_bvh;
   std::bitset<256> vgpr_used_by_vmem_store;
   std::bitset<256> vgpr_used_by_ds;

   /* VALUTransUseHazard */
   RegCounterMap<6> valu_since_wr_by_trans;
   RegCounterMap<2> trans_since_wr_by_trans;

   /* VALUMaskWriteHazard */
   std::bitset<128> sgpr_read_by_valu_as_lanemask;
   std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;

   /* WMMAHazards */
   std::bitset<256> vgpr_written_by_wmma;

   /* VALUReadSGPRHazard */
   std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
   RegCounterMap<11> sgpr_read_by_valu_then_wr_by_salu;

   void join(const NOP_ctx_gfx11& other)
   {
      has_Vcmpx |= other.has_Vcmpx;
      vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
      vgpr_used_by_vmem_sample |= other.vgpr_used_by_vmem_sample;
      vgpr_used_by_vmem_bvh |= other.vgpr_used_by_vmem_bvh;
      vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
      vgpr_used_by_ds |= other.vgpr_used_by_ds;
      valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
      trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
      sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
      sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
         other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
      vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
      sgpr_read_by_valu |= other.sgpr_read_by_valu;
      sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
   }

   bool operator==(const NOP_ctx_gfx11& other)
   {
      return has_Vcmpx == other.has_Vcmpx &&
             vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
             vgpr_used_by_vmem_sample == other.vgpr_used_by_vmem_sample &&
             vgpr_used_by_vmem_bvh == other.vgpr_used_by_vmem_bvh &&
             vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
             vgpr_used_by_ds == other.vgpr_used_by_ds &&
             valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
             trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
             sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
             sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
                other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
             vgpr_written_by_wmma == other.vgpr_written_by_wmma &&
             sgpr_read_by_valu == other.sgpr_read_by_valu &&
             sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu;
   }
};

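/* Number of wait states an instruction contributes towards the counters
 * above. Illustrative: "s_nop 3" provides imm + 1 = 4 wait states. */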
int
get_wait_states(aco_ptr<Instruction>& instr)
{
   if (instr->opcode == aco_opcode::s_nop)
      return instr->salu().imm + 1;
   else if (instr->opcode == aco_opcode::p_constaddr)
      return 3; /* lowered to 3 instructions in the assembler */
   else
      return 1;
}

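/* True iff the register ranges [a_reg, a_reg + a_size) and
 * [b_reg, b_reg + b_size) overlap. Illustrative: regs_intersect(s0, 2, s1, 1)
 * is true because s[0:1] covers s1. */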
bool
regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
   return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
}

template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards_internal(State& state, GlobalState& global_state, BlockState block_state,
                          Block* block, bool start_at_end)
{
   if (block == state.block && start_at_end) {
      /* If it's the current block, block->instructions is incomplete. */
      for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) {
         aco_ptr<Instruction>& instr = state.old_instructions[pred_idx];
         if (!instr)
            break; /* Instruction has been moved to block->instructions. */
         if (instr_cb(global_state, block_state, instr))
            return;
      }
   }

   for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
      if (instr_cb(global_state, block_state, block->instructions[pred_idx]))
         return;
   }

   PRAGMA_DIAGNOSTIC_PUSH
   PRAGMA_DIAGNOSTIC_IGNORED(-Waddress)
   if (block_cb != nullptr && !block_cb(global_state, block_state, block))
      return;
   PRAGMA_DIAGNOSTIC_POP

   for (unsigned lin_pred : block->linear_preds) {
      search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
         state, global_state, block_state, &state.program->blocks[lin_pred], true);
   }
}

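/* Driver for the backwards walk above, starting at the instruction currently
 * being processed. instr_cb returns true to stop the walk on that path;
 * block_cb (optional) returns false to stop before visiting a block's linear
 * predecessors. BlockState is copied per path, GlobalState is shared. */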
template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards(State& state, GlobalState& global_state, BlockState& block_state)
{
   search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
      state, global_state, block_state, state.block, false);
}

struct HandleRawHazardGlobalState {
   PhysReg reg;
   int nops_needed;
};

struct HandleRawHazardBlockState {
   uint32_t mask;
   int nops_needed;
};

template <bool Valu, bool Vintrp, bool Salu>
bool
handle_raw_hazard_instr(HandleRawHazardGlobalState& global_state,
                        HandleRawHazardBlockState& block_state, aco_ptr<Instruction>& pred)
{
   unsigned mask_size = util_last_bit(block_state.mask);

   uint32_t writemask = 0;
   for (Definition& def : pred->definitions) {
      if (regs_intersect(global_state.reg, mask_size, def.physReg(), def.size())) {
         unsigned start = def.physReg() > global_state.reg ? def.physReg() - global_state.reg : 0;
         unsigned end = MIN2(mask_size, start + def.size());
         writemask |= u_bit_consecutive(start, end - start);
      }
   }

   bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) ||
                                       (pred->isSALU() && Salu));
   if (is_hazard) {
      global_state.nops_needed = MAX2(global_state.nops_needed, block_state.nops_needed);
      return true;
   }

   block_state.mask &= ~writemask;
   block_state.nops_needed = MAX2(block_state.nops_needed - get_wait_states(pred), 0);

   if (block_state.mask == 0)
      block_state.nops_needed = 0;

   return block_state.nops_needed == 0;
}

template <bool Valu, bool Vintrp, bool Salu>
void
handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
{
   if (*NOPs >= min_states)
      return;

   HandleRawHazardGlobalState global = {op.physReg(), 0};
   HandleRawHazardBlockState block = {u_bit_consecutive(0, op.size()), min_states};

   /* Loops require branch instructions, which count towards the wait
    * states. So even with loops this should finish unless nops_needed is some
    * huge value. */
   search_backwards<HandleRawHazardGlobalState, HandleRawHazardBlockState, nullptr,
                    handle_raw_hazard_instr<Valu, Vintrp, Salu>>(state, global, block);

   *NOPs = MAX2(*NOPs, global.nops_needed);
}

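/* Instantiations of the RAW-hazard walker: each alias checks whether the given
 * operand was recently written by the producer kinds selected by the template
 * arguments <Valu, Vintrp, Salu>. */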
static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;

void
set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      BITSET_SET_RANGE_INSIDE_WORD(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      set_bitset_range(words, start, BITSET_WORDBITS - start_mod);
      set_bitset_range(words, start + first_size, size - first_size);
   }
}

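/* test_bitset_range() below mirrors the word-boundary split used by
 * set_bitset_range(): illustrative with 32-bit bitset words, a query on bits
 * [30, 35] is evaluated as [30, 31] in the first word plus [32, 35] in the
 * next. */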
bool
test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      return BITSET_TEST_RANGE(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) ||
             test_bitset_range(words, start + first_size, size - first_size);
   }
}

/* A SMEM clause is any group of consecutive SMEM instructions. The
 * instructions in this group may return out of order and/or may be replayed.
 *
 * To fix this potential hazard correctly, we have to make sure that when a
 * clause has more than one instruction, no instruction in the clause writes
 * to a register that is read by another instruction in the clause (including
 * itself). In this case, we have to break the SMEM clause by inserting
 * non-SMEM instructions.
 *
 * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
 */
void
handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                           int* NOPs)
{
   /* break off from previous SMEM clause if needed */
   if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
      /* Don't allow clauses with store instructions since the clause's
       * instructions may use the same address. */
      if (ctx.smem_write || instr->definitions.empty() ||
          instr_info.is_atomic[(unsigned)instr->opcode]) {
         *NOPs = 1;
      } else if (program->dev.xnack_enabled) {
         for (Operand op : instr->operands) {
            if (!op.isConstant() &&
                test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
               *NOPs = 1;
               break;
            }
         }

         Definition def = instr->definitions[0];
         if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
            *NOPs = 1;
      }
   }
}

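/* Illustrative clause break under XNACK:
 *    s_load_dwordx2 s[0:1], ...
 *    s_load_dword   s0, ...    ; writes an SGPR the clause already accesses
 * handle_smem_clause_hazards() requests one NOP so the two loads no longer
 * form a clause that could be replayed out of order. */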
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void
handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                        std::vector<aco_ptr<Instruction>>& new_instructions)
{
   /* check hazards */
   int NOPs = 0;

   if (instr->isSMEM()) {
      if (state.program->gfx_level == GFX6) {
         /* A read of an SGPR by an SMRD instruction requires 4 wait states
          * when the SGPR was written by a VALU instruction. According to LLVM,
          * there is also an undocumented hardware behavior when the buffer
          * descriptor is written by a SALU instruction */
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand op = instr->operands[i];
            if (op.isConstant())
               continue;

            bool is_buffer_desc = i == 0 && op.size() > 2;
            if (is_buffer_desc)
               handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
            else
               handle_valu_then_read_hazard(state, &NOPs, 4, op);
         }
      }

      handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
   } else if (instr->isSALU()) {
      if (instr->opcode == aco_opcode::s_setreg_b32 ||
          instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
          instr->opcode == aco_opcode::s_getreg_b32) {
         NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
      }

      if (state.program->gfx_level == GFX9) {
         if (instr->opcode == aco_opcode::s_movrels_b32 ||
             instr->opcode == aco_opcode::s_movrels_b64 ||
             instr->opcode == aco_opcode::s_movreld_b32 ||
             instr->opcode == aco_opcode::s_movreld_b64) {
            NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
         }
      }

      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isDS() && instr->ds().gds) {
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isVALU() || instr->isVINTRP()) {
      if (instr->isDPP()) {
         NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
         handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
      }

      for (Definition def : instr->definitions) {
         if (def.regClass().type() != RegType::sgpr) {
            for (unsigned i = 0; i < def.size(); i++)
               NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
         }
      }

      if ((instr->opcode == aco_opcode::v_readlane_b32 ||
           instr->opcode == aco_opcode::v_readlane_b32_e64 ||
           instr->opcode == aco_opcode::v_writelane_b32 ||
           instr->opcode == aco_opcode::v_writelane_b32_e64) &&
          !instr->operands[1].isConstant()) {
         handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
      }

      /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
       * is followed by a read with v_readfirstlane or v_readlane to fix GPU
       * hangs on GFX6. Note that v_writelane_* is apparently not affected.
       * This hazard isn't documented anywhere but AMD confirmed it.
       */
      if (state.program->gfx_level == GFX6 &&
          (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
           instr->opcode == aco_opcode::v_readfirstlane_b32)) {
         handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
      }

      if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
          instr->opcode == aco_opcode::v_div_fmas_f64)
         NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
      for (Operand op : instr->operands) {
         if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
            handle_valu_then_read_hazard(state, &NOPs, 5, op);
      }
   }

   if (!instr->isSALU() && instr->format != Format::SMEM)
      NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9) {
      bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
      if (instr->isVINTRP() || lds_scratch_global ||
          instr->opcode == aco_opcode::ds_read_addtid_b32 ||
          instr->opcode == aco_opcode::ds_write_addtid_b32 ||
          instr->opcode == aco_opcode::buffer_store_lds_dword) {
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
      }
   }

   ctx.add_wait_states(NOPs + get_wait_states(instr));

   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
   if (NOPs) {
      /* create NOP */
      aco_ptr<Instruction> nop{create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)};
      nop->salu().imm = NOPs - 1;
      new_instructions.emplace_back(std::move(nop));
   }

   /* update information to check for later hazards */
   if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
      ctx.smem_clause = false;
      ctx.smem_write = false;

      if (state.program->dev.xnack_enabled) {
         BITSET_ZERO(ctx.smem_clause_read_write);
         BITSET_ZERO(ctx.smem_clause_write);
      }
   }

   if (instr->isSMEM()) {
      if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
         ctx.smem_write = true;
      } else {
         ctx.smem_clause = true;

         if (state.program->dev.xnack_enabled) {
            for (Operand op : instr->operands) {
               if (!op.isConstant()) {
                  set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
               }
            }

            Definition def = instr->definitions[0];
            set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
            set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
         }
      }
   } else if (instr->isVALU()) {
      for (Definition def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            if (def.physReg() == vcc || def.physReg() == vcc_hi) {
               ctx.valu_wr_vcc_then_div_fmas = 4;
            }
            if (def.physReg() == exec || def.physReg() == exec_hi) {
               ctx.valu_wr_exec_then_dpp = 5;
            }
         }
      }
   } else if (instr->isSALU()) {
      if (!instr->definitions.empty()) {
         /* all other definitions should be SCC */
         Definition def = instr->definitions[0];
         if (def.physReg() == m0) {
            ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
            ctx.salu_wr_m0_then_lds = 1;
            ctx.salu_wr_m0_then_moverel = 1;
         }
      } else if (instr->opcode == aco_opcode::s_setreg_b32 ||
                 instr->opcode == aco_opcode::s_setreg_imm32_b32) {
         SALU_instruction& sopk = instr->salu();
         unsigned offset = (sopk.imm >> 6) & 0x1f;
         unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
         unsigned reg = sopk.imm & 0x3f;
         ctx.setreg_then_getsetreg = 2;

         if (reg == 1 && offset >= 28 && size > (28 - offset))
            ctx.set_vskip_mode_then_vector = 2;
      }
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
      bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
                          instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
      /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
       * store) */
      bool consider_mimg = instr->isMIMG() &&
                           instr->operands[1].regClass().type() == RegType::vgpr &&
                           instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
      /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
      bool consider_flat =
         instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
      if (consider_buf || consider_mimg || consider_flat) {
         PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
         unsigned size = instr->operands[consider_flat ? 2 : 3].size();
         for (unsigned i = 0; i < size; i++)
            ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
      }
   }
}

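/* Minimal instr_cb for search_backwards(): records whether the most recent
 * instruction on this path is VINTRP, then stops the walk immediately. */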
bool
is_latest_instr_vintrp(bool& global_state, bool& block_state, aco_ptr<Instruction>& pred)
{
   if (pred->isVINTRP())
      global_state = true;
   return true;
}

template <bool Salu, bool Sgpr>
bool
handle_wr_hazard_instr(int& global_state, int& block_state, aco_ptr<Instruction>& pred)
{
   if (Salu ? pred->isSALU() : (pred->isVALU() || pred->isVINTRP())) {
      for (Definition dst : pred->definitions) {
         if ((dst.physReg().reg() < 256) == Sgpr) {
            global_state = MAX2(global_state, block_state);
            return true;
         }
      }
   }

   block_state -= get_wait_states(pred);
   return block_state <= 0;
}

template <bool Salu, bool Sgpr>
void
handle_wr_hazard(State& state, int* NOPs, int min_states)
{
   if (*NOPs >= min_states)
      return;

   int global = 0;
   int block = min_states;
   search_backwards<int, int, nullptr, handle_wr_hazard_instr<Salu, Sgpr>>(state, global, block);
   *NOPs = MAX2(*NOPs, global);
}

void
resolve_all_gfx6(State& state, NOP_ctx_gfx6& ctx,
                 std::vector<aco_ptr<Instruction>>& new_instructions)
{
   int NOPs = 0;

   /* SGPR->SMEM hazards */
   if (state.program->gfx_level == GFX6) {
      handle_wr_hazard<true, true>(state, &NOPs, 4);
      handle_wr_hazard<false, true>(state, &NOPs, 4);
   }

   /* Break up SMEM clauses */
   if (ctx.smem_clause || ctx.smem_write)
      NOPs = MAX2(NOPs, 1);

   /* SALU/GDS hazards */
   NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
   NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);

   /* VALU hazards */
   NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
   if (state.program->gfx_level >= GFX8)
      handle_wr_hazard<false, false>(state, &NOPs, 2); /* VALU->DPP */
   NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data.any() ? 1 : 0);
   if (state.program->gfx_level == GFX6) {
      /* VINTRP->v_readlane_b32/etc */
      bool vintrp = false;
      search_backwards<bool, bool, nullptr, is_latest_instr_vintrp>(state, vintrp, vintrp);
      if (vintrp)
         NOPs = MAX2(NOPs, 1);
   }
   NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);

   /* VALU(sgpr)->VMEM/v_readlane_b32/etc hazards. v_readlane_b32/etc require only 4 NOPs. */
   handle_wr_hazard<false, true>(state, &NOPs, 5);

   NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);

   ctx.add_wait_states(NOPs);
   if (NOPs) {
      Builder bld(state.program, &new_instructions);
      bld.sopp(aco_opcode::s_nop, NOPs - 1);
   }
}

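/* The std::bitset helpers below are indexed by physical register number: an
 * instruction matches when any register of one of its definitions
 * (check_written_regs) or operands (check_read_regs) has its bit set. */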
template <std::size_t N>
bool
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [&check_regs](const Definition& def) -> bool
                      {
                         bool writes_any = false;
                         for (unsigned i = 0; i < def.size(); i++) {
                            unsigned def_reg = def.physReg() + i;
                            writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
                         }
                         return writes_any;
                      });
}

template <std::size_t N>
bool
check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->operands.begin(), instr->operands.end(),
                      [&check_regs](const Operand& op) -> bool
                      {
                         if (op.isConstant())
                            return false;
                         bool reads_any = false;
                         for (unsigned i = 0; i < op.size(); i++) {
                            unsigned op_reg = op.physReg() + i;
                            reads_any |= op_reg < check_regs.size() && check_regs[op_reg];
                         }
                         return reads_any;
                      });
}

template <std::size_t N>
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   for (const Operand& op : instr->operands) {
      for (unsigned i = 0; i < op.size(); i++) {
         unsigned reg = op.physReg() + i;
         if (reg < reg_reads.size())
            reg_reads.set(reg);
      }
   }
}

template <std::size_t N>
void
mark_read_regs_exec(State& state, const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   mark_read_regs(instr, reg_reads);
   reg_reads.set(exec);
   if (state.program->wave_size == 64)
      reg_reads.set(exec_hi);
}

bool
VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
   if (instr->isVOPC())
      return true;
   if (instr->isVOP3() && instr->definitions.size() == 2)
      return true;
   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32_e64)
      return true;
   return false;
}

bool
instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [](const Definition& def) -> bool
                      { return def.getTemp().type() == RegType::sgpr; });
}

inline bool
instr_is_branch(const aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz ||
          instr->opcode == aco_opcode::s_cbranch_execz ||
          instr->opcode == aco_opcode::s_cbranch_execnz ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
          instr->opcode == aco_opcode::s_cbranch_cdbguser ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
          instr->opcode == aco_opcode::s_subvector_loop_begin ||
          instr->opcode == aco_opcode::s_subvector_loop_end ||
          instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
          instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
}

void
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
{
   // TODO: s_dcache_inv needs to be in its own group on GFX10

   Builder bld(state.program, &new_instructions);

   unsigned vm_vsrc = 7;
   unsigned sa_sdst = 1;
   if (debug_flags & DEBUG_FORCE_WAITDEPS) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
      vm_vsrc = 0;
      sa_sdst = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      vm_vsrc = (instr->salu().imm >> 2) & 0x7;
      sa_sdst = instr->salu().imm & 0x1;
   }

   /* VMEMtoScalarWriteHazard
    * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
    * in-between.
    */
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
      /* Remember all SGPRs that are read by the VMEM/DS instruction */
      if (instr->isVMEM() || instr->isFlatLike())
         mark_read_regs_exec(
            state, instr,
            instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM);
      if (instr->isFlat() || instr->isDS())
         mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
   } else if (instr->isSALU() || instr->isSMEM()) {
      wait_imm imm;
      if (imm.unpack(state.program->gfx_level, instr.get())) {
         if (imm.vm == 0)
            ctx.sgprs_read_by_VMEM.reset();
         if (imm.lgkm == 0)
            ctx.sgprs_read_by_DS.reset();
         if (imm.vs == 0)
            ctx.sgprs_read_by_VMEM_store.reset();
      } else if (vm_vsrc == 0) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();
      }

      /* Check if SALU writes an SGPR that was previously read by a VMEM/DS instruction */
      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) ||
          check_written_regs(instr, ctx.sgprs_read_by_DS) ||
          check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
      }
   } else if (instr->isVALU()) {
      /* Hazard is mitigated by any VALU instruction */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
    */
   if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
      /* we only need to check definitions[0] because since GFX10 v_cmpx only writes one dest */
      ctx.has_VOPC_write_exec = true;
   } else if (ctx.has_VOPC_write_exec && (instr->opcode == aco_opcode::v_permlane16_b32 ||
                                          instr->opcode == aco_opcode::v_permlanex16_b32)) {
      ctx.has_VOPC_write_exec = false;

      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
      bld.vop1(aco_opcode::v_mov_b32, Definition(instr->operands[0].physReg(), v1),
               Operand(instr->operands[0].physReg(), v1));
   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
      ctx.has_VOPC_write_exec = false;
   }

   /* VcmpxExecWARHazard
    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
    */
   if (!instr->isVALU() && instr->reads_exec()) {
      ctx.has_nonVALU_exec_read = true;
   } else if (instr->isVALU() && ctx.has_nonVALU_exec_read) {
      if (instr->writes_exec()) {
         ctx.has_nonVALU_exec_read = false;

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
      } else if (instr_writes_sgpr(instr)) {
         /* Any VALU instruction that writes an SGPR mitigates the problem */
         ctx.has_nonVALU_exec_read = false;
      }
   } else if (sa_sdst == 0) {
      ctx.has_nonVALU_exec_read = false;
   }

   /* SMEMtoVectorWriteHazard
    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
    */
   if (instr->isSMEM()) {
      /* Remember all SGPRs that are read by the SMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
   } else if (VALU_writes_sgpr(instr)) {
      /* Check if VALU writes an SGPR that was previously read by SMEM */
      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
         ctx.sgprs_read_by_SMEM.reset();

         /* Insert s_mov to mitigate the problem */
         bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
      }
   } else if (instr->isSALU()) {
      wait_imm imm;
      if (imm.unpack(state.program->gfx_level, instr.get()) && imm.lgkm == 0) {
         /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
         ctx.sgprs_read_by_SMEM.reset();
      } else if (instr->format != Format::SOPP && instr->definitions.size()) {
         /* SALU can mitigate the hazard */
         ctx.sgprs_read_by_SMEM.reset();
      }
   }

   /* LdsBranchVmemWARHazard
    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
    */
   if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) {
      if (ctx.has_branch_after_DS)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_DS = false;
      ctx.has_VMEM = true;
   } else if (instr->isDS()) {
      if (ctx.has_branch_after_VMEM)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_VMEM = false;
      ctx.has_DS = true;
   } else if (instr_is_branch(instr)) {
      ctx.has_branch_after_VMEM |= ctx.has_VMEM;
      ctx.has_branch_after_DS |= ctx.has_DS;
      ctx.has_VMEM = ctx.has_DS = false;
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      /* Only s_waitcnt_vscnt can mitigate the hazard */
      const SALU_instruction& sopk = instr->salu();
      if (sopk.operands[0].physReg() == sgpr_null && sopk.imm == 0)
         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug
    * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
    * 0).
    */
   if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
      ctx.has_NSA_MIMG = true;
   } else if (ctx.has_NSA_MIMG) {
      ctx.has_NSA_MIMG = false;

      if (instr->isMUBUF() || instr->isMTBUF()) {
         uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
         if (offset & 6)
            bld.sopp(aco_opcode::s_nop, 0);
      }
   }

   /* waNsaCannotFollowWritelane
    * Handles NSA MIMG immediately following a v_writelane_b32.
    */
   if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
      ctx.has_writelane = true;
   } else if (ctx.has_writelane) {
      ctx.has_writelane = false;
      if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
         bld.sopp(aco_opcode::s_nop, 0);
   }
}

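/* Conservatively resolves all pending GFX10 hazards at once. Illustrative:
 * the s_waitcnt_depctr immediates compose by AND, so 0xffe3 (vm_vsrc=0) and
 * 0xfffe (sa_sdst=0) combine into a single 0xffe2 wait. */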
void
resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

   size_t prev_count = new_instructions.size();

   /* VcmpxPermlaneHazard */
   if (ctx.has_VOPC_write_exec) {
      ctx.has_VOPC_write_exec = false;
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));

      /* VALU mitigates VMEMtoScalarWriteHazard. */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   unsigned waitcnt_depctr = 0xffff;

   /* VMEMtoScalarWriteHazard */
   if (ctx.sgprs_read_by_VMEM.any() || ctx.sgprs_read_by_DS.any() ||
       ctx.sgprs_read_by_VMEM_store.any()) {
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
      waitcnt_depctr &= 0xffe3;
   }

   /* VcmpxExecWARHazard */
   if (ctx.has_nonVALU_exec_read) {
      ctx.has_nonVALU_exec_read = false;
      waitcnt_depctr &= 0xfffe;
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);

   /* SMEMtoVectorWriteHazard */
   if (ctx.sgprs_read_by_SMEM.any()) {
      ctx.sgprs_read_by_SMEM.reset();
      bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
   }

   /* LdsBranchVmemWARHazard */
   if (ctx.has_VMEM || ctx.has_branch_after_VMEM || ctx.has_DS || ctx.has_branch_after_DS) {
      bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug/waNsaCannotFollowWritelane */
   if (ctx.has_NSA_MIMG || ctx.has_writelane) {
      ctx.has_NSA_MIMG = ctx.has_writelane = false;
      /* Any instruction resolves these hazards. */
      if (new_instructions.size() == prev_count)
         bld.sopp(aco_opcode::s_nop, 0);
   }
}

void
fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
{
   if (reg.reg() < 256)
      return;
   for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
      set.set(reg.reg() - 256 + i);
}

bool
test_vgpr_bitset(std::bitset<256>& set, Operand op)
{
   if (op.physReg().reg() < 256)
      return false;
   for (unsigned i = 0; i < op.size(); i++) {
      if (set[op.physReg().reg() - 256 + i])
         return true;
   }
   return false;
}

/* GFX11 */
struct LdsDirectVALUHazardGlobalState {
   unsigned wait_vdst = 15;
   PhysReg vgpr;
   std::set<unsigned> loop_headers_visited;
};

struct LdsDirectVALUHazardBlockState {
   unsigned num_valu = 0;
   bool has_trans = false;

   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};

bool
handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state,
                                    aco_ptr<Instruction>& instr)
{
   if (instr->isVALU()) {
      block_state.has_trans |= instr->isTrans();

      bool uses_vgpr = false;
      for (Definition& def : instr->definitions)
         uses_vgpr |= regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1);
      for (Operand& op : instr->operands) {
         uses_vgpr |=
            !op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1);
      }
      if (uses_vgpr) {
         /* Transcendentals execute in parallel to other VALU and va_vdst count becomes unusable */
         global_state.wait_vdst =
            MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
         return true;
      }

      block_state.num_valu++;
   }

   if (parse_depctr_wait(instr.get()).va_vdst == 0)
      return true;

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set wait_vdst to be safe. */
      global_state.wait_vdst =
         MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
      return true;
   }

   return block_state.num_valu >= global_state.wait_vdst;
}

bool
handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state, Block* block)
{
   if (block->kind & block_kind_loop_header) {
      if (global_state.loop_headers_visited.count(block->index))
         return false;
      global_state.loop_headers_visited.insert(block->index);
   }

   block_state.num_blocks++;

   return true;
}

unsigned
handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* LdsDirectVALUHazard
    * Handle LDSDIR writing a VGPR after it's used by a VALU instruction.
    */
   if (instr->ldsdir().wait_vdst == 0)
      return 0; /* early exit */

   LdsDirectVALUHazardGlobalState global_state;
   global_state.wait_vdst = instr->ldsdir().wait_vdst;
   global_state.vgpr = instr->definitions[0].physReg();
   LdsDirectVALUHazardBlockState block_state;
   search_backwards<LdsDirectVALUHazardGlobalState, LdsDirectVALUHazardBlockState,
                    &handle_lds_direct_valu_hazard_block, &handle_lds_direct_valu_hazard_instr>(
      state, global_state, block_state);
   return global_state.wait_vdst;
}

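/* The backwards walk below matches this pattern (illustrative, wave64):
 *
 *    v_mov_b32 v0, ...       ; first VGPR write
 *    s_mov_b64 exec, ...     ; SALU exec write between the two VGPR writes
 *    v_mov_b32 v1, ...       ; second VGPR write, <3 VALU after the first
 *    ...                     ; <5 VALU until the read
 *    v_add_f32 ..., v0, v1   ; reads both VGPRs -> hazard
 *
 * Since the walk runs backwards, the states below are encountered in reverse:
 * first a write after the exec write, then the exec write itself. */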
enum VALUPartialForwardingHazardState : uint8_t {
   nothing_written,
   written_after_exec_write,
   exec_written,
};

struct VALUPartialForwardingHazardGlobalState {
   bool hazard_found = false;
   std::set<unsigned> loop_headers_visited;
};

struct VALUPartialForwardingHazardBlockState {
   /* initialized by number of VGPRs read by VALU, decrement when encountered to return early */
   uint8_t num_vgprs_read = 0;
   BITSET_DECLARE(vgprs_read, 256) = {0};
   enum VALUPartialForwardingHazardState state = nothing_written;
   unsigned num_valu_since_read = 0;
   unsigned num_valu_since_write = 0;

   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};

bool
handle_valu_partial_forwarding_hazard_instr(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            aco_ptr<Instruction>& instr)
{
   /* Check if there is already a hazard found on some other control flow path. */
   if (global_state.hazard_found)
      return true;

   if (instr->isSALU() && !instr->definitions.empty()) {
      if (block_state.state == written_after_exec_write && instr->writes_exec())
         block_state.state = exec_written;
   } else if (instr->isVALU()) {
      bool vgpr_write = false;
      for (Definition& def : instr->definitions) {
         if (def.physReg().reg() < 256)
            continue;

         for (unsigned i = 0; i < def.size(); i++) {
            unsigned reg = def.physReg().reg() - 256 + i;
            if (!BITSET_TEST(block_state.vgprs_read, reg))
               continue;

            if (block_state.state == exec_written && block_state.num_valu_since_write < 3) {
               global_state.hazard_found = true;
               return true;
            }

            BITSET_CLEAR(block_state.vgprs_read, reg);
            block_state.num_vgprs_read--;
            vgpr_write = true;
         }
      }

      if (vgpr_write) {
         /* If the state is nothing_written: the check below should ensure that this write is
          * close enough to the read.
          *
          * If the state is exec_written: the current choice of second write has failed. Reset and
          * try with the current write as the second one, if it's close enough to the read.
          *
          * If the state is written_after_exec_write: a further second write would be better, if
          * it's close enough to the read.
          */
         if (block_state.state == nothing_written || block_state.num_valu_since_read < 5) {
            block_state.state = written_after_exec_write;
            block_state.num_valu_since_write = 0;
         } else {
            block_state.num_valu_since_write++;
         }
      } else {
         block_state.num_valu_since_write++;
      }

      block_state.num_valu_since_read++;
   } else if (parse_depctr_wait(instr.get()).va_vdst == 0) {
      return true;
   }

   if (block_state.num_valu_since_read >= (block_state.state == nothing_written ? 5 : 8))
      return true; /* Hazard not possible at this distance. */
   if (block_state.num_vgprs_read == 0)
      return true; /* All VGPRs have been written and a hazard was never found. */

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set hazard_found=true to be safe. */
      global_state.hazard_found = true;
      return true;
   }

   return false;
}

bool
handle_valu_partial_forwarding_hazard_block(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            Block* block)
{
   if (block->kind & block_kind_loop_header) {
      if (global_state.loop_headers_visited.count(block->index))
         return false;
      global_state.loop_headers_visited.insert(block->index);
   }

   block_state.num_blocks++;

   return true;
}

bool
handle_valu_partial_forwarding_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* VALUPartialForwardingHazard
    * VALU instruction reads two VGPRs: one written before an exec write by SALU and one after.
    * For the hazard, there must be less than 3 VALU between the first and second VGPR writes.
    * There also must be less than 5 VALU between the second VGPR write and the current instruction.
    */
   if (state.program->wave_size != 64 || !instr->isVALU())
      return false;

   unsigned num_vgprs = 0;
   for (Operand& op : instr->operands)
      num_vgprs += op.physReg().reg() < 256 ? op.size() : 1;
   if (num_vgprs <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardBlockState block_state;

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Operand& op = instr->operands[i];
      if (op.physReg().reg() < 256)
         continue;
      for (unsigned j = 0; j < op.size(); j++)
         BITSET_SET(block_state.vgprs_read, op.physReg().reg() - 256 + j);
   }
   block_state.num_vgprs_read = BITSET_COUNT(block_state.vgprs_read);

   if (block_state.num_vgprs_read <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardGlobalState global_state;
   search_backwards<VALUPartialForwardingHazardGlobalState, VALUPartialForwardingHazardBlockState,
                    &handle_valu_partial_forwarding_hazard_block,
                    &handle_valu_partial_forwarding_hazard_instr>(state, global_state, block_state);
   return global_state.hazard_found;
}

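/* Per-instruction GFX11+ handler: first emits any mitigation that must precede
 * instr (s_nop, v_nop, s_waitcnt_depctr), then updates ctx with the hazards
 * instr itself introduces. */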
1378 void
handle_instruction_gfx11(State & state,NOP_ctx_gfx11 & ctx,aco_ptr<Instruction> & instr,std::vector<aco_ptr<Instruction>> & new_instructions)1379 handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
1380 std::vector<aco_ptr<Instruction>>& new_instructions)
1381 {
1382 Builder bld(state.program, &new_instructions);
1383
1384 /* Due to a hazard, an s_nop is needed before "s_sendmsg sendmsg_dealloc_vgprs". */
1385 if (instr->opcode == aco_opcode::s_sendmsg && instr->salu().imm == sendmsg_dealloc_vgprs &&
1386 (new_instructions.empty() || new_instructions.back()->opcode != aco_opcode::s_nop)) {
1387 bld.sopp(aco_opcode::s_nop, 0);
1388 }
1389
1390 /* VcmpxPermlaneHazard
1391 * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
1392 */
1393 if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
1394 ctx.has_Vcmpx = true;
1395 } else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 ||
1396 instr->opcode == aco_opcode::v_permlanex16_b32 ||
1397 instr->opcode == aco_opcode::v_permlane64_b32 ||
1398 instr->opcode == aco_opcode::v_permlane16_var_b32 ||
1399 instr->opcode == aco_opcode::v_permlanex16_var_b32)) {
1400 ctx.has_Vcmpx = false;
1401
1402 /* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */
1403 bld.vop1(aco_opcode::v_nop);
1404 } else if (instr->isVALU()) {
1405 ctx.has_Vcmpx = false;
1406 }
1407
1408 unsigned va_vdst = parse_depctr_wait(instr.get()).va_vdst;
1409 unsigned vm_vsrc = 7;
1410 unsigned sa_sdst = 1;
1411
1412 if (debug_flags & DEBUG_FORCE_WAITDEPS) {
1413 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
1414 va_vdst = 0;
1415 vm_vsrc = 0;
1416 sa_sdst = 0;
1417 } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
1418 /* va_vdst already obtained through parse_depctr_wait(). */
1419 vm_vsrc = (instr->salu().imm >> 2) & 0x7;
1420 sa_sdst = instr->salu().imm & 0x1;
1421 } else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) {
1422 vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0;
1423 }
1424
1425 if (instr->isLDSDIR()) {
1426 unsigned count = handle_lds_direct_valu_hazard(state, instr);
1427 LDSDIR_instruction* ldsdir = &instr->ldsdir();
1428 if (count < va_vdst) {
1429 ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
1430 va_vdst = MIN2(va_vdst, count);
1431 }
1432 }
1433
1434 /* VALUTransUseHazard
1435 * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
1436 * in-between.
1437 */
1438 if (state.program->gfx_level < GFX11_5 && va_vdst > 0 && instr->isVALU()) {
1439 uint8_t num_valu = 15;
1440 uint8_t num_trans = 15;
1441 for (Operand& op : instr->operands) {
1442 if (op.physReg().reg() < 256)
1443 continue;
1444 for (unsigned i = 0; i < op.size(); i++) {
1445 PhysReg reg = op.physReg().advance(i * 4);
1446 num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(reg));
1447 num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(reg));
1448 }
1449 }
1450 if (num_trans <= 1 && num_valu <= 5) {
1451 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1452 va_vdst = 0;
1453 }
1454 }
1455
1456 if (va_vdst > 0 && state.program->gfx_level < GFX12 &&
1457 handle_valu_partial_forwarding_hazard(state, instr)) {
1458 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1459 va_vdst = 0;
1460 }
1461
1462 if (state.program->gfx_level < GFX12) {
1463 /* VALUMaskWriteHazard
1464 * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU or
1465 * VALU.
1466 */
      if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU()) &&
          check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
         sa_sdst = 0;
      }

      if (va_vdst == 0) {
         ctx.valu_since_wr_by_trans.reset();
         ctx.trans_since_wr_by_trans.reset();
      }

      if (sa_sdst == 0)
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();

      if (state.program->wave_size == 64 && instr->isSALU() &&
          check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
         unsigned reg = instr->definitions[0].physReg().reg();
         for (unsigned i = 0; i < instr->definitions[0].size(); i++)
            ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu[reg + i] = 1;
      }

      if (instr->isVALU()) {
         bool is_trans = instr->isTrans();

         ctx.valu_since_wr_by_trans.inc();
         if (is_trans)
            ctx.trans_since_wr_by_trans.inc();

         if (is_trans) {
            for (Definition& def : instr->definitions) {
               for (unsigned i = 0; i < def.size(); i++) {
                  PhysReg reg = def.physReg().advance(i * 4);
                  ctx.valu_since_wr_by_trans.set(reg);
                  ctx.trans_since_wr_by_trans.set(reg);
               }
            }
         }

         if (state.program->wave_size == 64) {
            for (Operand& op : instr->operands) {
               /* This should ignore exec reads */
               if (!op.isConstant() && op.physReg().reg() < 126)
                  ctx.sgpr_read_by_valu_as_lanemask.reset();
            }
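            /* These opcodes read an SGPR pair as a lane mask: a carry-in or a
             * condition/select mask in the last operand.
             */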
            switch (instr->opcode) {
            case aco_opcode::v_addc_co_u32:
            case aco_opcode::v_subb_co_u32:
            case aco_opcode::v_subbrev_co_u32:
            case aco_opcode::v_cndmask_b16:
            case aco_opcode::v_cndmask_b32:
            case aco_opcode::v_div_fmas_f32:
            case aco_opcode::v_div_fmas_f64:
               if (instr->operands.back().physReg() != exec) {
                  ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
                  ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
               }
               break;
            default: break;
            }
         }
      }
   } else {
      /* VALUReadSGPRHazard
       * An SGPR that was read by a VALU and then written by a SALU cannot safely be
       * read by a VALU or SALU afterwards.
       */
      if (instr->isVALU() || instr->isSALU()) {
         unsigned expiry_count = instr->isSALU() ? 10 : 11;
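         /* Judging by the counter updates below, the hazard is considered expired
          * once enough SALU instructions have executed since the write; the
          * threshold differs by one depending on whether the reader is a SALU or a
          * VALU instruction.
          */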
         for (Operand& op : instr->operands) {
            if (sa_sdst == 0)
               break;

            for (unsigned i = 0; i < op.size(); i++) {
               PhysReg reg = op.physReg().advance(i * 4);
               if (reg <= m0 && ctx.sgpr_read_by_valu_then_wr_by_salu.get(reg) < expiry_count) {
                  bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
                  sa_sdst = 0;
                  break;
               }
            }
         }
      }

      if (sa_sdst == 0)
         ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
      else if (instr->isSALU() && !instr->isSOPP())
         ctx.sgpr_read_by_valu_then_wr_by_salu.inc();

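      /* SGPRs read by a VALU are tracked at the granularity of aligned SGPR pairs,
       * hence the division by 2 below.
       */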
      if (instr->isVALU()) {
         for (const Operand& op : instr->operands) {
            for (unsigned i = 0; i < DIV_ROUND_UP(op.size(), 2); i++) {
               unsigned reg = (op.physReg() / 2) + i;
               if (reg < ctx.sgpr_read_by_valu.size())
                  ctx.sgpr_read_by_valu.set(reg);
            }
         }
      } else if (instr->isSALU() && !instr->definitions.empty()) {
         for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
            PhysReg def_reg = instr->definitions[0].physReg().advance(i * 4);
            if ((def_reg / 2) < ctx.sgpr_read_by_valu.size() && ctx.sgpr_read_by_valu[def_reg / 2])
               ctx.sgpr_read_by_valu_then_wr_by_salu.set(def_reg);
         }
      }
   }

   /* LdsDirectVMEMHazard
    * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
    */
   if (instr->isVMEM() || instr->isFlatLike()) {
      if (instr->definitions.empty()) {
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
      } else {
         uint8_t vmem_type = state.program->gfx_level >= GFX12
                                ? get_vmem_type(state.program->gfx_level, instr.get())
                                : vmem_nosampler;
         std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
         if (vmem_type == vmem_sampler)
            vgprs = &ctx.vgpr_used_by_vmem_sample;
         else if (vmem_type == vmem_bvh)
            vgprs = &ctx.vgpr_used_by_vmem_bvh;

         for (Definition& def : instr->definitions)
            fill_vgpr_bitset(*vgprs, def.physReg(), def.bytes());
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(*vgprs, op.physReg(), op.bytes());
      }
   }
   if (instr->isDS() || instr->isFlat()) {
      for (Definition& def : instr->definitions)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes());
      for (Operand& op : instr->operands)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
   }
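   /* The hazard is gone once the VGPRs are no longer in flight: any VALU or export,
    * an explicit vm_vsrc=0 wait, or a counter wait covering the respective
    * instruction type clears the tracked sets.
    */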
   wait_imm imm;
   if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_sample.reset();
      ctx.vgpr_used_by_vmem_bvh.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   } else if (imm.unpack(state.program->gfx_level, instr.get())) {
      if (imm.vm == 0)
         ctx.vgpr_used_by_vmem_load.reset();
      if (imm.sample == 0)
         ctx.vgpr_used_by_vmem_sample.reset();
      if (imm.bvh == 0)
         ctx.vgpr_used_by_vmem_bvh.reset();
      if (imm.lgkm == 0)
         ctx.vgpr_used_by_ds.reset();
      if (imm.vs == 0)
         ctx.vgpr_used_by_vmem_store.reset();
   }
   if (instr->isLDSDIR()) {
      if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_vmem_sample[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_vmem_bvh[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) {
         if (state.program->gfx_level >= GFX12)
            instr->ldsdir().wait_vsrc = 0;
         else
            bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
         ctx.vgpr_used_by_vmem_load.reset();
         ctx.vgpr_used_by_vmem_sample.reset();
         ctx.vgpr_used_by_vmem_bvh.reset();
         ctx.vgpr_used_by_vmem_store.reset();
         ctx.vgpr_used_by_ds.reset();
      }
   }

   /* WMMA Hazards
    * A WMMA instruction that reads the result of a previous WMMA as one of its
    * input matrices requires an intervening VALU instruction (v_nop).
    */
   if (instr_info.classes[(int)instr->opcode] == instr_class::wmma) {
      assert(instr->operands.back().regClass() == instr->definitions[0].regClass());

      bool is_swmma = instr->operands.size() == 4;
      if (test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[0]) ||
          test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[1]) ||
          (is_swmma && test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[2]))) {
         bld.vop1(aco_opcode::v_nop);
      }

      ctx.vgpr_written_by_wmma.reset();
      fill_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->definitions[0].physReg(),
                       instr->definitions[0].bytes());
   } else if (instr->isVALU()) {
      ctx.vgpr_written_by_wmma.reset();
   }
}

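/* Callback for search_backwards(): looks for an instruction that waits for all
 * outstanding VALU VGPR writes (va_vdst == 0) within the last block_state
 * instructions. global_state becomes false if the budget runs out, or if a VALU
 * that reads or writes VGPRs is encountered first.
 */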
bool
has_vdst0_since_valu_instr(bool& global_state, unsigned& block_state, aco_ptr<Instruction>& pred)
{
   if (parse_depctr_wait(pred.get()).va_vdst == 0)
      return true;

   if (--block_state == 0) {
      global_state = false;
      return true;
   }

   if (pred->isVALU()) {
      bool vgpr_rd_or_wr = false;
      for (Definition def : pred->definitions) {
         if (def.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      for (Operand op : pred->operands) {
         if (op.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      if (vgpr_rd_or_wr) {
         global_state = false;
         return true;
      }
   }

   return false;
}

void
resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

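   /* Start with an immediate that waits for nothing (all fields at their maximum)
    * and clear individual fields below for each hazard that needs resolving.
    */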
   unsigned waitcnt_depctr = 0xffff;
   bool valu_read_sgpr = false;

   /* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
   bool has_vdst0_since_valu = true;
   unsigned depth = 16;
   search_backwards<bool, unsigned, nullptr, has_vdst0_since_valu_instr>(
      state, has_vdst0_since_valu, depth);
   if (!has_vdst0_since_valu) {
      waitcnt_depctr &= 0x0fff;
      ctx.valu_since_wr_by_trans.reset();
      ctx.trans_since_wr_by_trans.reset();
   }

   /* VcmpxPermlaneHazard/WMMAHazards */
   if (ctx.has_Vcmpx || ctx.vgpr_written_by_wmma.any()) {
      ctx.has_Vcmpx = false;
      ctx.vgpr_written_by_wmma.reset();
      bld.vop1(aco_opcode::v_nop);
   }

   /* VALUMaskWriteHazard */
   if (state.program->gfx_level < GFX12 && state.program->wave_size == 64) {
      if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any()) {
         waitcnt_depctr &= 0xfffe;
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
      }
      if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
         valu_read_sgpr = true;
         ctx.sgpr_read_by_valu_as_lanemask.reset();
      }
   }

   /* VALUReadSGPRHazard */
   if (state.program->gfx_level >= GFX12) {
      if (!ctx.sgpr_read_by_valu_then_wr_by_salu.empty())
         waitcnt_depctr &= 0xfffe;

      ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
   }

   /* LdsDirectVMEMHazard */
   if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
       ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
       ctx.vgpr_used_by_vmem_bvh.any()) {
      waitcnt_depctr &= 0xffe3;
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);

   if (valu_read_sgpr) {
      /* This has to be after the s_waitcnt_depctr so that the instruction is not
       * involved in any other hazards. */
      bld.vop3(aco_opcode::v_xor3_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
               Operand(PhysReg(0), s1), Operand(PhysReg(0), s1));

      /* workaround possible LdsDirectVALUHazard/VALUPartialForwardingHazard */
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
   }
}

template <typename Ctx>
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
                             std::vector<aco_ptr<Instruction>>&);

template <typename Ctx>
using ResolveAll = void (*)(State& state, Ctx&, std::vector<aco_ptr<Instruction>>&);

template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   State state;
   state.program = program;
   state.block = &block;
   state.old_instructions = std::move(block.instructions);

   block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
   block.instructions.reserve(state.old_instructions.size());

   bool found_end = false;
   for (aco_ptr<Instruction>& instr : state.old_instructions) {
      Handle(state, ctx, instr, block.instructions);

      /* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
      if (instr->opcode == aco_opcode::s_setpc_b64) {
         block.instructions.emplace_back(std::move(instr));

         std::vector<aco_ptr<Instruction>> resolve_instrs;
         Resolve(state, ctx, resolve_instrs);
         block.instructions.insert(std::prev(block.instructions.end()),
                                   std::move_iterator(resolve_instrs.begin()),
                                   std::move_iterator(resolve_instrs.end()));

         found_end = true;
         continue;
      }

      found_end |= instr->opcode == aco_opcode::s_endpgm;
      block.instructions.emplace_back(std::move(instr));
   }

   /* Resolve all possible hazards (we don't know what the shader is concatenated with). */
   if (block.linear_succs.empty() && !found_end)
      Resolve(state, ctx, block.instructions);
}

template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
mitigate_hazards(Program* program, Ctx initial_ctx = Ctx())
{
   std::vector<Ctx> all_ctx(program->blocks.size());
   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& block = program->blocks[i];
      Ctx& ctx = all_ctx[i];

      if (i == 0 || (block.kind & block_kind_resume))
         ctx = initial_ctx;

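      /* Contexts flowing in over loop back-edges are unknown when the header is
       * first visited, so the whole loop is processed again at the loop exit, with
       * each block joining its predecessors' final contexts, until the header
       * context stabilizes.
       */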
      if (block.kind & block_kind_loop_header) {
         loop_header_indices.push(i);
      } else if (block.kind & block_kind_loop_exit) {
         /* Go through the whole loop again */
         for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
            Ctx loop_block_ctx;
            for (unsigned b : program->blocks[idx].linear_preds)
               loop_block_ctx.join(all_ctx[b]);

            handle_block<Ctx, Handle, Resolve>(program, loop_block_ctx, program->blocks[idx]);

            /* We only need to continue if the loop header context changed */
            if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
               break;

            all_ctx[idx] = loop_block_ctx;
         }

         loop_header_indices.pop();
      }

      for (unsigned b : block.linear_preds)
         ctx.join(all_ctx[b]);

      handle_block<Ctx, Handle, Resolve>(program, ctx, block);
   }
}

/* FeatureRequiredExportPriority in LLVM */
void
required_export_priority(Program* program)
{
   /* Skip callees, assuming that the caller has already increased the priority. */
   bool increase_priority = !program->is_epilog && !program->info.vs.has_prolog &&
                            (!program->info.merged_shader_compiled_separately ||
                             program->stage.sw == SWStage::VS || program->stage.sw == SWStage::TES);
   increase_priority |= program->is_prolog;

   for (Block& block : program->blocks) {
      std::vector<aco_ptr<Instruction>> new_instructions;
      new_instructions.reserve(block.instructions.size() + 6);

      Builder bld(program, &new_instructions);

      if (increase_priority && block.index == 0) {
         if (!block.instructions.empty() && block.instructions[0]->opcode == aco_opcode::s_setprio)
            block.instructions[0]->salu().imm = MAX2(block.instructions[0]->salu().imm, 2);
         else
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      for (unsigned i = 0; i < block.instructions.size(); i++) {
         Instruction* instr = block.instructions[i].get();
         new_instructions.push_back(std::move(block.instructions[i]));

         if (instr->opcode == aco_opcode::s_setprio) {
            instr->salu().imm = MAX2(instr->salu().imm, 2);
            continue;
         }

         bool end_of_export_sequence = instr->isEXP() && (i == block.instructions.size() - 1 ||
                                                          !block.instructions[i + 1]->isEXP());
         if (!end_of_export_sequence)
            continue;

         bool before_endpgm = false;
         if (i != block.instructions.size() - 1) {
            before_endpgm = block.instructions[i + 1]->opcode == aco_opcode::s_endpgm;
         } else {
            /* Does this fall through to an s_endpgm? */
            for (unsigned j = block.index + 1; j < program->blocks.size(); j++) {
               if (program->blocks[j].instructions.size() == 1 &&
                   program->blocks[j].instructions[0]->opcode == aco_opcode::s_endpgm)
                  before_endpgm = true;
               if (!program->blocks[j].instructions.empty())
                  break;
            }
         }

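         /* Drop the priority at the end of each export sequence and pad with s_nop,
          * mirroring LLVM's FeatureRequiredExportPriority handling. The expcnt wait
          * and the re-raise to priority 2 are skipped when s_endpgm follows
          * immediately.
          */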
         bld.sopp(aco_opcode::s_setprio, 0);
         if (!before_endpgm)
            bld.sopk(aco_opcode::s_waitcnt_expcnt, Operand(sgpr_null, s1), 0);
         bld.sopp(aco_opcode::s_nop, 0);
         bld.sopp(aco_opcode::s_nop, 0);
         if (!before_endpgm)
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      block.instructions = std::move(new_instructions);
   }
}

} /* end namespace */

void
insert_NOPs(Program* program)
{
   if (program->gfx_level >= GFX11) {
      NOP_ctx_gfx11 initial_ctx;

      bool has_previous_part =
         program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog ||
         (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS &&
          program->stage.sw != SWStage::TES) || program->stage == raytracing_cs;
      if (program->gfx_level >= GFX12 && has_previous_part) {
         /* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that
          * any SGPR might have been read by VALU if there was a previous shader part.
          */
         initial_ctx.sgpr_read_by_valu.flip();
      }

      mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
                                                                                   initial_ctx);
   } else if (program->gfx_level >= GFX10_3) {
      ; /* no hazards/bugs to mitigate */
   } else if (program->gfx_level >= GFX10) {
      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
   } else {
      mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
   }

   if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
                                         program->stage.hw == AC_HW_PIXEL_SHADER))
      required_export_priority(program);
}

} // namespace aco