/*
 * Copyright © 2019 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "util/bitset.h"

#include <algorithm>
#include <bitset>
#include <set>
#include <stack>
#include <vector>

namespace aco {
namespace {

struct State {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> old_instructions;
};

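/* Hazard-mitigation state for GFX6-GFX9. Each counter is the number of wait
 * states still needed between a hazardous instruction pair; add_wait_states()
 * decrements them and join() merges them with MAX2() so the worst case among
 * control-flow predecessors is kept. */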
struct NOP_ctx_gfx6 {
   void join(const NOP_ctx_gfx6& other)
   {
      set_vskip_mode_then_vector =
         MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
      valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
      salu_wr_m0_then_gds_msg_ttrace =
         MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
      valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
      salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
      salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
      setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
      vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
      smem_clause |= other.smem_clause;
      smem_write |= other.smem_write;
      for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
         smem_clause_read_write[i] |= other.smem_clause_read_write[i];
         smem_clause_write[i] |= other.smem_clause_write[i];
      }
   }

   bool operator==(const NOP_ctx_gfx6& other)
   {
      return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
             valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
             vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
             salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
             valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
             salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
             salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
             setreg_then_getsetreg == other.setreg_then_getsetreg &&
             smem_clause == other.smem_clause && smem_write == other.smem_write &&
             BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
             BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
   }

   void add_wait_states(unsigned amount)
   {
      if ((set_vskip_mode_then_vector -= amount) < 0)
         set_vskip_mode_then_vector = 0;

      if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
         valu_wr_vcc_then_div_fmas = 0;

      if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
         salu_wr_m0_then_gds_msg_ttrace = 0;

      if ((valu_wr_exec_then_dpp -= amount) < 0)
         valu_wr_exec_then_dpp = 0;

      if ((salu_wr_m0_then_lds -= amount) < 0)
         salu_wr_m0_then_lds = 0;

      if ((salu_wr_m0_then_moverel -= amount) < 0)
         salu_wr_m0_then_moverel = 0;

      if ((setreg_then_getsetreg -= amount) < 0)
         setreg_then_getsetreg = 0;

      vmem_store_then_wr_data.reset();
   }

   /* setting MODE.vskip and then any vector op requires 2 wait states */
   int8_t set_vskip_mode_then_vector = 0;

   /* VALU writing VCC followed by v_div_fmas requires 4 wait states */
   int8_t valu_wr_vcc_then_div_fmas = 0;

   /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
   int8_t salu_wr_m0_then_gds_msg_ttrace = 0;

   /* VALU writing EXEC followed by DPP requires 5 wait states */
   int8_t valu_wr_exec_then_dpp = 0;

   /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
   int8_t salu_wr_m0_then_lds = 0;

   /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
   int8_t salu_wr_m0_then_moverel = 0;

   /* s_setreg followed by an s_getreg/s_setreg of the same register needs 2 wait states;
    * currently we don't look at the actual register */
   int8_t setreg_then_getsetreg = 0;

   /* some memory instructions writing >64 bits, followed by an instruction
    * writing the VGPRs holding the write-data, require 1 wait state */
   std::bitset<256> vmem_store_then_wr_data;

   /* we break up SMEM clauses that contain stores or overwrite an
    * operand/definition of another instruction in the clause */
   bool smem_clause = false;
   bool smem_write = false;
   BITSET_DECLARE(smem_clause_read_write, 128) = {0};
   BITSET_DECLARE(smem_clause_write, 128) = {0};
};

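/* Hazard state for GFX10/GFX10.3: pending-hazard flags plus bitsets of SGPRs
 * recently read by VMEM/DS/SMEM instructions. */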
struct NOP_ctx_gfx10 {
   bool has_VOPC_write_exec = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   bool has_NSA_MIMG = false;
   bool has_writelane = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_VMEM_store;
   std::bitset<128> sgprs_read_by_DS;
   std::bitset<128> sgprs_read_by_SMEM;

   void join(const NOP_ctx_gfx10& other)
   {
      has_VOPC_write_exec |= other.has_VOPC_write_exec;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      has_NSA_MIMG |= other.has_NSA_MIMG;
      has_writelane |= other.has_writelane;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_DS |= other.sgprs_read_by_DS;
      sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   bool operator==(const NOP_ctx_gfx10& other)
   {
      return has_VOPC_write_exec == other.has_VOPC_write_exec &&
             has_nonVALU_exec_read == other.has_nonVALU_exec_read && has_VMEM == other.has_VMEM &&
             has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS &&
             has_branch_after_DS == other.has_branch_after_DS &&
             has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_DS == other.sgprs_read_by_DS &&
             sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};

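/* Tracks, per physical register, how many counted events (inc() calls) have
 * passed since the register was last set(), saturating at Max. 'present' is a
 * coarse filter over reg & 0x7F that lets get() skip the list walk. */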
template <int Max> struct RegCounterMap {
   void inc() { base++; }
   void set(PhysReg reg) { update(reg, 0); }

   uint8_t get(PhysReg reg)
   {
      if (present.test(reg.reg() & 0x7F)) {
         for (entry& e : list) {
            if (e.reg == reg.reg())
               return MIN2(base - e.val, Max);
         }
      }
      return Max;
   }

   void reset()
   {
      present.reset();
      list.clear();
      base = 0;
   }

   bool empty()
   {
      for (entry& e : list) {
         if (base - e.val < Max)
            return false;
      }
      return true;
   }

   void join_min(const RegCounterMap& other)
   {
      for (const entry& e : other.list) {
         int idx = other.base - e.val;
         if (idx >= Max)
            continue;

         update(e.reg, idx);
      }
   }

   void update(uint16_t reg, int idx)
   {
      int16_t val = base - idx;
      for (entry& e : list) {
         if (e.reg == reg) {
            e.val = MAX2(e.val, val);
            return;
         }
      }
      list.push_back(entry{reg, val});
      present.set(reg & 0x7F);
   }

   bool operator==(const RegCounterMap& other) const
   {
      /* Two maps with different bases could also be equal, but for our use case,
       * i.e. checking for changes at loop headers, this is sufficient since we
       * always join the predecessors into an empty map with base=0.
       */
      return base == other.base && list == other.list;
   }

private:
   struct entry {
      uint16_t reg;
      int16_t val;
      bool operator!=(const entry& other) const { return reg != other.reg || val != other.val; }
   };

   std::bitset<128> present;
   small_vec<entry, 4> list;
   int base = 0;
};

struct NOP_ctx_gfx11 {
   /* VcmpxPermlaneHazard */
   bool has_Vcmpx = false;

   /* LdsDirectVMEMHazard */
   std::bitset<256> vgpr_used_by_vmem_load;
   std::bitset<256> vgpr_used_by_vmem_sample;
   std::bitset<256> vgpr_used_by_vmem_bvh;
   std::bitset<256> vgpr_used_by_vmem_store;
   std::bitset<256> vgpr_used_by_ds;

   /* VALUTransUseHazard */
   RegCounterMap<6> valu_since_wr_by_trans;
   RegCounterMap<2> trans_since_wr_by_trans;

   /* VALUMaskWriteHazard */
   std::bitset<128> sgpr_read_by_valu_as_lanemask;
   std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;

   /* WMMAHazards */
   std::bitset<256> vgpr_written_by_wmma;

   /* VALUReadSGPRHazard */
   std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
   std::bitset<m0.reg()> sgpr_read_by_valu_then_wr_by_valu;
   RegCounterMap<11> sgpr_read_by_valu_then_wr_by_salu;

   void join(const NOP_ctx_gfx11& other)
   {
      has_Vcmpx |= other.has_Vcmpx;
      vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
      vgpr_used_by_vmem_sample |= other.vgpr_used_by_vmem_sample;
      vgpr_used_by_vmem_bvh |= other.vgpr_used_by_vmem_bvh;
      vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
      vgpr_used_by_ds |= other.vgpr_used_by_ds;
      valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
      trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
      sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
      sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
         other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
      vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
      sgpr_read_by_valu |= other.sgpr_read_by_valu;
      sgpr_read_by_valu_then_wr_by_valu |= other.sgpr_read_by_valu_then_wr_by_valu;
      sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
   }

   bool operator==(const NOP_ctx_gfx11& other)
   {
      return has_Vcmpx == other.has_Vcmpx &&
             vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
             vgpr_used_by_vmem_sample == other.vgpr_used_by_vmem_sample &&
             vgpr_used_by_vmem_bvh == other.vgpr_used_by_vmem_bvh &&
             vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
             vgpr_used_by_ds == other.vgpr_used_by_ds &&
             valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
             trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
             sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
             sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
                other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
             vgpr_written_by_wmma == other.vgpr_written_by_wmma &&
             sgpr_read_by_valu == other.sgpr_read_by_valu &&
             sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu;
   }
};

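/* How many wait states executing this instruction accounts for. */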
int
get_wait_states(aco_ptr<Instruction>& instr)
{
   if (instr->opcode == aco_opcode::s_nop)
      return instr->salu().imm + 1;
   else if (instr->opcode == aco_opcode::p_constaddr)
      return 3; /* lowered to 3 instructions in the assembler */
   else
      return 1;
}

bool
regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
   return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
}

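/* Walks instructions backwards, starting at the current position, then
 * recurses into linear predecessors. instr_cb returning true stops the search
 * on that path; block_cb (if non-null) returning false stops before the
 * predecessors. BlockState is copied per path, GlobalState is shared. */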
template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards_internal(State& state, GlobalState& global_state, BlockState block_state,
                          Block* block, bool start_at_end)
{
   if (block == state.block && start_at_end) {
      /* If it's the current block, block->instructions is incomplete. */
      for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) {
         aco_ptr<Instruction>& instr = state.old_instructions[pred_idx];
         if (!instr)
            break; /* Instruction has been moved to block->instructions. */
         if (instr_cb(global_state, block_state, instr))
            return;
      }
   }

   for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
      if (instr_cb(global_state, block_state, block->instructions[pred_idx]))
         return;
   }

   PRAGMA_DIAGNOSTIC_PUSH
   PRAGMA_DIAGNOSTIC_IGNORED(-Waddress)
   if (block_cb != nullptr && !block_cb(global_state, block_state, block))
      return;
   PRAGMA_DIAGNOSTIC_POP

   for (unsigned lin_pred : block->linear_preds) {
      search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
         state, global_state, block_state, &state.program->blocks[lin_pred], true);
   }
}

template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards(State& state, GlobalState& global_state, BlockState& block_state)
{
   search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
      state, global_state, block_state, state.block, false);
}

struct HandleRawHazardGlobalState {
   PhysReg reg;
   int nops_needed;
};

struct HandleRawHazardBlockState {
   uint32_t mask;
   int nops_needed;
};

template <bool Valu, bool Vintrp, bool Salu>
bool
handle_raw_hazard_instr(HandleRawHazardGlobalState& global_state,
                        HandleRawHazardBlockState& block_state, aco_ptr<Instruction>& pred)
{
   unsigned mask_size = util_last_bit(block_state.mask);

   uint32_t writemask = 0;
   for (Definition& def : pred->definitions) {
      if (regs_intersect(global_state.reg, mask_size, def.physReg(), def.size())) {
         unsigned start = def.physReg() > global_state.reg ? def.physReg() - global_state.reg : 0;
         unsigned end = MIN2(mask_size, start + def.size());
         writemask |= u_bit_consecutive(start, end - start);
      }
   }

   bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) ||
                                       (pred->isSALU() && Salu));
   if (is_hazard) {
      global_state.nops_needed = MAX2(global_state.nops_needed, block_state.nops_needed);
      return true;
   }

   block_state.mask &= ~writemask;
   block_state.nops_needed = MAX2(block_state.nops_needed - get_wait_states(pred), 0);

   if (block_state.mask == 0)
      block_state.nops_needed = 0;

   return block_state.nops_needed == 0;
}

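/* Checks for a read-after-write hazard on op: searches backwards for a write
 * to op's registers by the selected producer kinds (Valu/Vintrp/Salu) within
 * min_states wait states and raises *NOPs accordingly. */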
template <bool Valu, bool Vintrp, bool Salu>
void
handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
{
   if (*NOPs >= min_states)
      return;

   HandleRawHazardGlobalState global = {op.physReg(), 0};
   HandleRawHazardBlockState block = {u_bit_consecutive(0, op.size()), min_states};

   /* Loops require branch instructions, which count towards the wait
    * states. So even with loops this should finish unless nops_needed is some
    * huge value. */
   search_backwards<HandleRawHazardGlobalState, HandleRawHazardBlockState, nullptr,
                    handle_raw_hazard_instr<Valu, Vintrp, Salu>>(state, global, block);

   *NOPs = MAX2(*NOPs, global.nops_needed);
}

static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;

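/* BITSET_SET_RANGE_INSIDE_WORD/BITSET_TEST_RANGE only operate within a single
 * BITSET_WORD, so split ranges that straddle a word boundary. */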
void
set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      BITSET_SET_RANGE_INSIDE_WORD(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      set_bitset_range(words, start, BITSET_WORDBITS - start_mod);
      set_bitset_range(words, start + first_size, size - first_size);
   }
}

bool
test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      return BITSET_TEST_RANGE(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) ||
             test_bitset_range(words, start + first_size, size - first_size);
   }
}

/* A SMEM clause is any group of consecutive SMEM instructions. The
 * instructions in this group may return out of order and/or may be replayed.
 *
 * To fix this potential hazard correctly, we have to make sure that when a
 * clause has more than one instruction, no instruction in the clause writes
 * to a register that is read by another instruction in the clause (including
 * itself). In this case, we have to break the SMEM clause by inserting
 * non-SMEM instructions.
 *
 * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
 */
void
handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                           int* NOPs)
{
   /* break off from previous SMEM clause if needed */
   if (!*NOPs && (ctx.smem_clause || ctx.smem_write)) {
      /* Don't allow clauses with store instructions since the clause's
       * instructions may use the same address. */
      if (ctx.smem_write || instr->definitions.empty() ||
          instr_info.is_atomic[(unsigned)instr->opcode]) {
         *NOPs = 1;
      } else if (program->dev.xnack_enabled) {
         for (Operand op : instr->operands) {
            if (!op.isConstant() &&
                test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
               *NOPs = 1;
               break;
            }
         }

         Definition def = instr->definitions[0];
         if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
            *NOPs = 1;
      }
   }
}

/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void
handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                        std::vector<aco_ptr<Instruction>>& new_instructions)
{
   /* check hazards */
   int NOPs = 0;

   if (instr->isSMEM()) {
      if (state.program->gfx_level == GFX6) {
         /* A read of an SGPR by SMRD instruction requires 4 wait states
          * when the SGPR was written by a VALU instruction. According to LLVM,
          * there is also an undocumented hardware behavior when the buffer
          * descriptor is written by a SALU instruction */
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand op = instr->operands[i];
            if (op.isConstant())
               continue;

            bool is_buffer_desc = i == 0 && op.size() > 2;
            if (is_buffer_desc)
               handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
            else
               handle_valu_then_read_hazard(state, &NOPs, 4, op);
         }
      }

      handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
   } else if (instr->isSALU()) {
      if (instr->opcode == aco_opcode::s_setreg_b32 ||
          instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
          instr->opcode == aco_opcode::s_getreg_b32) {
         NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
      }

      if (state.program->gfx_level == GFX9) {
         if (instr->opcode == aco_opcode::s_movrels_b32 ||
             instr->opcode == aco_opcode::s_movrels_b64 ||
             instr->opcode == aco_opcode::s_movreld_b32 ||
             instr->opcode == aco_opcode::s_movreld_b64) {
            NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
         }
      }

      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isDS() && instr->ds().gds) {
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isVALU() || instr->isVINTRP()) {
      if (instr->isDPP()) {
         NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
         handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
      }

      for (Definition def : instr->definitions) {
         if (def.regClass().type() != RegType::sgpr) {
            for (unsigned i = 0; i < def.size(); i++)
               NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
         }
      }

      if ((instr->opcode == aco_opcode::v_readlane_b32 ||
           instr->opcode == aco_opcode::v_readlane_b32_e64 ||
           instr->opcode == aco_opcode::v_writelane_b32 ||
           instr->opcode == aco_opcode::v_writelane_b32_e64) &&
          !instr->operands[1].isConstant()) {
         handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
      }

      /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
       * is followed by a read with v_readfirstlane or v_readlane to fix GPU
       * hangs on GFX6. Note that v_writelane_* is apparently not affected.
       * This hazard isn't documented anywhere, but AMD confirmed it.
       */
      if (state.program->gfx_level == GFX6 &&
          (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
           instr->opcode == aco_opcode::v_readfirstlane_b32)) {
         handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
      }

      if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
          instr->opcode == aco_opcode::v_div_fmas_f64)
         NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
      for (Operand op : instr->operands) {
         if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
            handle_valu_then_read_hazard(state, &NOPs, 5, op);
      }
   }

   if (!instr->isSALU() && instr->format != Format::SMEM)
      NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9) {
      bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
      if (instr->isVINTRP() || lds_scratch_global ||
          instr->opcode == aco_opcode::ds_read_addtid_b32 ||
          instr->opcode == aco_opcode::ds_write_addtid_b32 ||
          instr->opcode == aco_opcode::buffer_store_lds_dword) {
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
      }
   }

   ctx.add_wait_states(NOPs + get_wait_states(instr));

   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
   if (NOPs) {
      /* create NOP */
      aco_ptr<Instruction> nop{create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)};
      nop->salu().imm = NOPs - 1;
      new_instructions.emplace_back(std::move(nop));
   }

   /* update information to check for later hazards */
   if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
      ctx.smem_clause = false;
      ctx.smem_write = false;

      if (state.program->dev.xnack_enabled) {
         BITSET_ZERO(ctx.smem_clause_read_write);
         BITSET_ZERO(ctx.smem_clause_write);
      }
   }

   if (instr->isSMEM()) {
      if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
         ctx.smem_write = true;
      } else {
         ctx.smem_clause = true;

         if (state.program->dev.xnack_enabled) {
            for (Operand op : instr->operands) {
               if (!op.isConstant()) {
                  set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
               }
            }

            Definition def = instr->definitions[0];
            set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
            set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
         }
      }
   } else if (instr->isVALU()) {
      for (Definition def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            if (def.physReg() == vcc || def.physReg() == vcc_hi) {
               ctx.valu_wr_vcc_then_div_fmas = 4;
            }
            if (def.physReg() == exec || def.physReg() == exec_hi) {
               ctx.valu_wr_exec_then_dpp = 5;
            }
         }
      }
   } else if (instr->isSALU()) {
      if (!instr->definitions.empty()) {
         /* all other definitions should be SCC */
         Definition def = instr->definitions[0];
         if (def.physReg() == m0) {
            ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
            ctx.salu_wr_m0_then_lds = 1;
            ctx.salu_wr_m0_then_moverel = 1;
         }
      } else if (instr->opcode == aco_opcode::s_setreg_b32 ||
                 instr->opcode == aco_opcode::s_setreg_imm32_b32) {
         SALU_instruction& sopk = instr->salu();
         unsigned offset = (sopk.imm >> 6) & 0x1f;
         unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
         unsigned reg = sopk.imm & 0x3f;
         ctx.setreg_then_getsetreg = 2;

         if (reg == 1 && offset >= 28 && size > (28 - offset))
            ctx.set_vskip_mode_then_vector = 2;
      }
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
      bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
                          instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
      /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
       * store) */
      bool consider_mimg = instr->isMIMG() &&
                           instr->operands[1].regClass().type() == RegType::vgpr &&
                           instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
      /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
      bool consider_flat =
         instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
      if (consider_buf || consider_mimg || consider_flat) {
         PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
         unsigned size = instr->operands[consider_flat ? 2 : 3].size();
         for (unsigned i = 0; i < size; i++)
            ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
      }
   }
}

bool
is_latest_instr_vintrp(bool& global_state, bool& block_state, aco_ptr<Instruction>& pred)
{
   if (pred->isVINTRP())
      global_state = true;
   return true;
}

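/* Searches backwards for a write by SALU (Salu=true) or VALU/VINTRP
 * (Salu=false) to an SGPR (Sgpr=true) or VGPR within min_states wait states. */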
template <bool Salu, bool Sgpr>
bool
handle_wr_hazard_instr(int& global_state, int& block_state, aco_ptr<Instruction>& pred)
{
   if (Salu ? pred->isSALU() : (pred->isVALU() || pred->isVINTRP())) {
      for (Definition dst : pred->definitions) {
         if ((dst.physReg().reg() < 256) == Sgpr) {
            global_state = MAX2(global_state, block_state);
            return true;
         }
      }
   }

   block_state -= get_wait_states(pred);
   return block_state <= 0;
}

template <bool Salu, bool Sgpr>
void
handle_wr_hazard(State& state, int* NOPs, int min_states)
{
   if (*NOPs >= min_states)
      return;

   int global = 0;
   int block = min_states;
   search_backwards<int, int, nullptr, handle_wr_hazard_instr<Salu, Sgpr>>(state, global, block);
   *NOPs = MAX2(*NOPs, global);
}

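/* Resolves all pending GFX6-GFX9 hazards at once, regardless of what the next
 * instruction is. */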
void
resolve_all_gfx6(State& state, NOP_ctx_gfx6& ctx,
                 std::vector<aco_ptr<Instruction>>& new_instructions)
{
   int NOPs = 0;

   /* SGPR->SMEM hazards */
   if (state.program->gfx_level == GFX6) {
      handle_wr_hazard<true, true>(state, &NOPs, 4);
      handle_wr_hazard<false, true>(state, &NOPs, 4);
   }

   /* Break up SMEM clauses */
   if (ctx.smem_clause || ctx.smem_write)
      NOPs = MAX2(NOPs, 1);

   /* SALU/GDS hazards */
   NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
   NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);

   /* VALU hazards */
   NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
   if (state.program->gfx_level >= GFX8)
      handle_wr_hazard<false, false>(state, &NOPs, 2); /* VALU->DPP */
   NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data.any() ? 1 : 0);
   if (state.program->gfx_level == GFX6) {
      /* VINTRP->v_readlane_b32/etc */
      bool vintrp = false;
      search_backwards<bool, bool, nullptr, is_latest_instr_vintrp>(state, vintrp, vintrp);
      if (vintrp)
         NOPs = MAX2(NOPs, 1);
   }
   NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);

   /* VALU(sgpr)->VMEM/v_readlane_b32/etc hazards. v_readlane_b32/etc require only 4 NOPs. */
   handle_wr_hazard<false, true>(state, &NOPs, 5);

   NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);

   ctx.add_wait_states(NOPs);
   if (NOPs) {
      Builder bld(state.program, &new_instructions);
      bld.sopp(aco_opcode::s_nop, NOPs - 1);
   }
}

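/* Returns true if the instruction writes any register marked in check_regs. */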
template <std::size_t N>
bool
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [&check_regs](const Definition& def) -> bool
                      {
                         bool writes_any = false;
                         for (unsigned i = 0; i < def.size(); i++) {
                            unsigned def_reg = def.physReg() + i;
                            writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
                         }
                         return writes_any;
                      });
}

template <std::size_t N>
bool
check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->operands.begin(), instr->operands.end(),
                      [&check_regs](const Operand& op) -> bool
                      {
                         if (op.isConstant())
                            return false;
                         bool reads_any = false;
                         for (unsigned i = 0; i < op.size(); i++) {
                            unsigned op_reg = op.physReg() + i;
                            reads_any |= op_reg < check_regs.size() && check_regs[op_reg];
                         }
                         return reads_any;
                      });
}

template <std::size_t N>
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   for (const Operand& op : instr->operands) {
      for (unsigned i = 0; i < op.size(); i++) {
         unsigned reg = op.physReg() + i;
         if (reg < reg_reads.size())
            reg_reads.set(reg);
      }
   }
}

template <std::size_t N>
void
mark_read_regs_exec(State& state, const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   mark_read_regs(instr, reg_reads);
   reg_reads.set(exec);
   if (state.program->wave_size == 64)
      reg_reads.set(exec_hi);
}

bool
VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
   if (instr->isVOPC())
      return true;
   if (instr->isVOP3() && instr->definitions.size() == 2)
      return true;
   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32_e64)
      return true;
   return false;
}

bool
instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [](const Definition& def) -> bool
                      { return def.getTemp().type() == RegType::sgpr; });
}

inline bool
instr_is_branch(const aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz ||
          instr->opcode == aco_opcode::s_cbranch_execz ||
          instr->opcode == aco_opcode::s_cbranch_execnz ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
          instr->opcode == aco_opcode::s_cbranch_cdbguser ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
          instr->opcode == aco_opcode::s_subvector_loop_begin ||
          instr->opcode == aco_opcode::s_subvector_loop_end ||
          instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
          instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
}

void
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
{
   // TODO: s_dcache_inv needs to be in its own group on GFX10

   Builder bld(state.program, &new_instructions);

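   /* Decode the fields of an s_waitcnt_depctr immediate that matter here;
    * vm_vsrc=7 and sa_sdst=1 are the "no wait" defaults. */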
   unsigned vm_vsrc = 7;
   unsigned sa_sdst = 1;
   if (debug_flags & DEBUG_FORCE_WAITDEPS) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
      vm_vsrc = 0;
      sa_sdst = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      vm_vsrc = (instr->salu().imm >> 2) & 0x7;
      sa_sdst = instr->salu().imm & 0x1;
   }

   /* VMEMtoScalarWriteHazard
    * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
    * in-between.
    */
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
      /* Remember all SGPRs that are read by the VMEM/DS instruction */
      if (instr->isVMEM() || instr->isFlatLike())
         mark_read_regs_exec(
            state, instr,
            instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM);
      if (instr->isFlat() || instr->isDS())
         mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
   } else if (instr->isSALU() || instr->isSMEM()) {
      wait_imm imm;
      if (imm.unpack(state.program->gfx_level, instr.get())) {
         if (imm.vm == 0)
            ctx.sgprs_read_by_VMEM.reset();
         if (imm.lgkm == 0)
            ctx.sgprs_read_by_DS.reset();
         if (imm.vs == 0)
            ctx.sgprs_read_by_VMEM_store.reset();
      } else if (vm_vsrc == 0) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();
      }

      /* Check if SALU writes an SGPR that was previously read by a VMEM/DS instruction */
      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) ||
          check_written_regs(instr, ctx.sgprs_read_by_DS) ||
          check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
      }
   } else if (instr->isVALU()) {
      /* Hazard is mitigated by any VALU instruction */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
    */
   if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
      /* we only need to check definitions[0] because, from GFX10 on, v_cmpx only writes one dest */
      ctx.has_VOPC_write_exec = true;
   } else if (ctx.has_VOPC_write_exec && (instr->opcode == aco_opcode::v_permlane16_b32 ||
                                          instr->opcode == aco_opcode::v_permlanex16_b32)) {
      ctx.has_VOPC_write_exec = false;

      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
      bld.vop1(aco_opcode::v_mov_b32, Definition(instr->operands[0].physReg(), v1),
               Operand(instr->operands[0].physReg(), v1));
   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
      ctx.has_VOPC_write_exec = false;
   }

   /* VcmpxExecWARHazard
    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
    */
   if (!instr->isVALU() && instr->reads_exec()) {
      ctx.has_nonVALU_exec_read = true;
   } else if (instr->isVALU() && ctx.has_nonVALU_exec_read) {
      if (instr->writes_exec()) {
         ctx.has_nonVALU_exec_read = false;

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
      } else if (instr_writes_sgpr(instr)) {
         /* Any VALU instruction that writes an SGPR mitigates the problem */
         ctx.has_nonVALU_exec_read = false;
      }
   } else if (sa_sdst == 0) {
      ctx.has_nonVALU_exec_read = false;
   }

   /* SMEMtoVectorWriteHazard
    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
    */
   if (instr->isSMEM()) {
      /* Remember all SGPRs that are read by the SMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
   } else if (VALU_writes_sgpr(instr)) {
      /* Check if VALU writes an SGPR that was previously read by SMEM */
      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
         ctx.sgprs_read_by_SMEM.reset();

         /* Insert s_mov to mitigate the problem */
         bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
      }
   } else if (instr->isSALU()) {
      wait_imm imm;
      if (imm.unpack(state.program->gfx_level, instr.get()) && imm.lgkm == 0) {
         /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
         ctx.sgprs_read_by_SMEM.reset();
      } else if (instr->format != Format::SOPP && instr->definitions.size()) {
         /* SALU can mitigate the hazard */
         ctx.sgprs_read_by_SMEM.reset();
      }
   }

   /* LdsBranchVmemWARHazard
    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
    */
   if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) {
      if (ctx.has_branch_after_DS)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_DS = false;
      ctx.has_VMEM = true;
   } else if (instr->isDS()) {
      if (ctx.has_branch_after_VMEM)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_VMEM = false;
      ctx.has_DS = true;
   } else if (instr_is_branch(instr)) {
      ctx.has_branch_after_VMEM |= ctx.has_VMEM;
      ctx.has_branch_after_DS |= ctx.has_DS;
      ctx.has_VMEM = ctx.has_DS = false;
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      /* Only s_waitcnt_vscnt can mitigate the hazard */
      const SALU_instruction& sopk = instr->salu();
      if (sopk.operands[0].physReg() == sgpr_null && sopk.imm == 0)
         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug
    * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
    * 0).
    */
   if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
      ctx.has_NSA_MIMG = true;
   } else if (ctx.has_NSA_MIMG) {
      ctx.has_NSA_MIMG = false;

      if (instr->isMUBUF() || instr->isMTBUF()) {
         uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
         if (offset & 6)
            bld.sopp(aco_opcode::s_nop, 0);
      }
   }

   /* waNsaCannotFollowWritelane
    * Handles NSA MIMG immediately following a v_writelane_b32.
    */
   if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
      ctx.has_writelane = true;
   } else if (ctx.has_writelane) {
      ctx.has_writelane = false;
      if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
         bld.sopp(aco_opcode::s_nop, 0);
   }
}

void
resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

   size_t prev_count = new_instructions.size();

   /* VcmpxPermlaneHazard */
   if (ctx.has_VOPC_write_exec) {
      ctx.has_VOPC_write_exec = false;
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));

      /* VALU mitigates VMEMtoScalarWriteHazard. */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   unsigned waitcnt_depctr = 0xffff;

   /* VMEMtoScalarWriteHazard */
   if (ctx.sgprs_read_by_VMEM.any() || ctx.sgprs_read_by_DS.any() ||
       ctx.sgprs_read_by_VMEM_store.any()) {
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
      waitcnt_depctr &= 0xffe3;
   }

   /* VcmpxExecWARHazard */
   if (ctx.has_nonVALU_exec_read) {
      ctx.has_nonVALU_exec_read = false;
      waitcnt_depctr &= 0xfffe;
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);

   /* SMEMtoVectorWriteHazard */
   if (ctx.sgprs_read_by_SMEM.any()) {
      ctx.sgprs_read_by_SMEM.reset();
      bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
   }

   /* LdsBranchVmemWARHazard */
   if (ctx.has_VMEM || ctx.has_branch_after_VMEM || ctx.has_DS || ctx.has_branch_after_DS) {
      bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug/waNsaCannotFollowWritelane */
   if (ctx.has_NSA_MIMG || ctx.has_writelane) {
      ctx.has_NSA_MIMG = ctx.has_writelane = false;
      /* Any instruction resolves these hazards. */
      if (new_instructions.size() == prev_count)
         bld.sopp(aco_opcode::s_nop, 0);
   }
}

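/* VGPR bitset helpers: bit i stands for VGPR i, i.e. PhysReg 256 + i; SGPRs
 * are ignored. */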
void
fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
{
   if (reg.reg() < 256)
      return;
   for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
      set.set(reg.reg() - 256 + i);
}

bool
test_vgpr_bitset(std::bitset<256>& set, Operand op)
{
   if (op.physReg().reg() < 256)
      return false;
   for (unsigned i = 0; i < op.size(); i++) {
      if (set[op.physReg().reg() - 256 + i])
         return true;
   }
   return false;
}

/* GFX11 */
struct LdsDirectVALUHazardGlobalState {
   unsigned wait_vdst = 15;
   PhysReg vgpr;
   std::set<unsigned> loop_headers_visited;
};

struct LdsDirectVALUHazardBlockState {
   unsigned num_valu = 0;
   bool has_trans = false;

   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};

bool
handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state,
                                    aco_ptr<Instruction>& instr)
{
   if (instr->isVALU()) {
      block_state.has_trans |= instr->isTrans();

      bool uses_vgpr = false;
      for (Definition& def : instr->definitions)
         uses_vgpr |= regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1);
      for (Operand& op : instr->operands) {
         uses_vgpr |=
            !op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1);
      }
      if (uses_vgpr) {
         /* Transcendentals execute in parallel to other VALU and va_vdst count becomes unusable */
         global_state.wait_vdst =
            MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
         return true;
      }

      block_state.num_valu++;
   }

   if (parse_depctr_wait(instr.get()).va_vdst == 0)
      return true;

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set wait_vdst to be safe. */
      global_state.wait_vdst =
         MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
      return true;
   }

   return block_state.num_valu >= global_state.wait_vdst;
}

bool
handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state, Block* block)
{
   if (block->kind & block_kind_loop_header) {
      if (global_state.loop_headers_visited.count(block->index))
         return false;
      global_state.loop_headers_visited.insert(block->index);
   }

   block_state.num_blocks++;

   return true;
}

unsigned
handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* LdsDirectVALUHazard
    * Handle LDSDIR writing a VGPR after it's used by a VALU instruction.
    */
   if (instr->ldsdir().wait_vdst == 0)
      return 0; /* early exit */

   LdsDirectVALUHazardGlobalState global_state;
   global_state.wait_vdst = instr->ldsdir().wait_vdst;
   global_state.vgpr = instr->definitions[0].physReg();
   LdsDirectVALUHazardBlockState block_state;
   search_backwards<LdsDirectVALUHazardGlobalState, LdsDirectVALUHazardBlockState,
                    &handle_lds_direct_valu_hazard_block, &handle_lds_direct_valu_hazard_instr>(
      state, global_state, block_state);
   return global_state.wait_vdst;
}

enum VALUPartialForwardingHazardState : uint8_t {
   nothing_written,
   written_after_exec_write,
   exec_written,
};

struct VALUPartialForwardingHazardGlobalState {
   bool hazard_found = false;
   std::set<unsigned> loop_headers_visited;
};

struct VALUPartialForwardingHazardBlockState {
   /* initialized to the number of VGPRs read by the VALU; decremented as they
    * are encountered so we can return early */
   uint8_t num_vgprs_read = 0;
   BITSET_DECLARE(vgprs_read, 256) = {0};
   enum VALUPartialForwardingHazardState state = nothing_written;
   unsigned num_valu_since_read = 0;
   unsigned num_valu_since_write = 0;

   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};

bool
handle_valu_partial_forwarding_hazard_instr(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            aco_ptr<Instruction>& instr)
{
   /* Check if there is already a hazard found on some other control flow path. */
   if (global_state.hazard_found)
      return true;

   if (instr->isSALU() && !instr->definitions.empty()) {
      if (block_state.state == written_after_exec_write && instr->writes_exec())
         block_state.state = exec_written;
   } else if (instr->isVALU()) {
      bool vgpr_write = false;
      for (Definition& def : instr->definitions) {
         if (def.physReg().reg() < 256)
            continue;

         for (unsigned i = 0; i < def.size(); i++) {
            unsigned reg = def.physReg().reg() - 256 + i;
            if (!BITSET_TEST(block_state.vgprs_read, reg))
               continue;

            if (block_state.state == exec_written && block_state.num_valu_since_write < 3) {
               global_state.hazard_found = true;
               return true;
            }

            BITSET_CLEAR(block_state.vgprs_read, reg);
            block_state.num_vgprs_read--;
            vgpr_write = true;
         }
      }

      if (vgpr_write) {
         /* If the state is nothing_written: the check below should ensure that this write is
          * close enough to the read.
          *
          * If the state is exec_written: the current choice of second write has failed. Reset and
          * try with the current write as the second one, if it's close enough to the read.
          *
          * If the state is written_after_exec_write: a further second write would be better, if
          * it's close enough to the read.
          */
         if (block_state.state == nothing_written || block_state.num_valu_since_read < 5) {
            block_state.state = written_after_exec_write;
            block_state.num_valu_since_write = 0;
         } else {
            block_state.num_valu_since_write++;
         }
      } else {
         block_state.num_valu_since_write++;
      }

      block_state.num_valu_since_read++;
   } else if (parse_depctr_wait(instr.get()).va_vdst == 0) {
      return true;
   }

   if (block_state.num_valu_since_read >= (block_state.state == nothing_written ? 5 : 8))
      return true; /* Hazard not possible at this distance. */
   if (block_state.num_vgprs_read == 0)
      return true; /* All VGPRs have been written and a hazard was never found. */

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set hazard_found=true to be safe. */
      global_state.hazard_found = true;
      return true;
   }

   return false;
}

bool
handle_valu_partial_forwarding_hazard_block(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            Block* block)
{
   if (block->kind & block_kind_loop_header) {
      if (global_state.loop_headers_visited.count(block->index))
         return false;
      global_state.loop_headers_visited.insert(block->index);
   }

   block_state.num_blocks++;

   return true;
}

bool
handle_valu_partial_forwarding_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* VALUPartialForwardingHazard
    * VALU instruction reads two VGPRs: one written before an exec write by SALU and one after.
    * For the hazard, there must be fewer than 3 VALU between the first and second VGPR writes.
    * There must also be fewer than 5 VALU between the second VGPR write and the current
    * instruction.
    */
   if (state.program->wave_size != 64 || !instr->isVALU())
      return false;

   unsigned num_vgprs = 0;
   for (Operand& op : instr->operands)
      num_vgprs += op.physReg().reg() < 256 ? op.size() : 1;
   if (num_vgprs <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardBlockState block_state;

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Operand& op = instr->operands[i];
      if (op.physReg().reg() < 256)
         continue;
      for (unsigned j = 0; j < op.size(); j++)
         BITSET_SET(block_state.vgprs_read, op.physReg().reg() - 256 + j);
   }
   block_state.num_vgprs_read = BITSET_COUNT(block_state.vgprs_read);

   if (block_state.num_vgprs_read <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardGlobalState global_state;
   search_backwards<VALUPartialForwardingHazardGlobalState, VALUPartialForwardingHazardBlockState,
                    &handle_valu_partial_forwarding_hazard_block,
                    &handle_valu_partial_forwarding_hazard_instr>(state, global_state, block_state);
   return global_state.hazard_found;
}

1380 void
handle_instruction_gfx11(State & state,NOP_ctx_gfx11 & ctx,aco_ptr<Instruction> & instr,std::vector<aco_ptr<Instruction>> & new_instructions)1381 handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
1382 std::vector<aco_ptr<Instruction>>& new_instructions)
1383 {
1384 Builder bld(state.program, &new_instructions);
1385
1386 /* Due to a hazard, an s_nop is needed before "s_sendmsg sendmsg_dealloc_vgprs". */
1387 if (instr->opcode == aco_opcode::s_sendmsg && instr->salu().imm == sendmsg_dealloc_vgprs &&
1388 (new_instructions.empty() || new_instructions.back()->opcode != aco_opcode::s_nop)) {
1389 bld.sopp(aco_opcode::s_nop, 0);
1390 }
1391
1392 /* VcmpxPermlaneHazard
1393 * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
1394 */
1395 if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
1396 ctx.has_Vcmpx = true;
1397 } else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 ||
1398 instr->opcode == aco_opcode::v_permlanex16_b32 ||
1399 instr->opcode == aco_opcode::v_permlane64_b32 ||
1400 instr->opcode == aco_opcode::v_permlane16_var_b32 ||
1401 instr->opcode == aco_opcode::v_permlanex16_var_b32)) {
1402 ctx.has_Vcmpx = false;
1403
1404 /* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */
1405 bld.vop1(aco_opcode::v_nop);
1406 } else if (instr->isVALU()) {
1407 ctx.has_Vcmpx = false;
1408 }
1409
1410 depctr_wait wait = parse_depctr_wait(instr.get());
1411 unsigned va_vdst = wait.va_vdst;
1412 unsigned vm_vsrc = 7;
1413 unsigned sa_sdst = 1;
1414
1415 if (debug_flags & DEBUG_FORCE_WAITDEPS) {
1416 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
1417 va_vdst = 0;
1418 vm_vsrc = 0;
1419 sa_sdst = 0;
1420 } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
1421 /* va_vdst already obtained through parse_depctr_wait(). */
1422 vm_vsrc = (instr->salu().imm >> 2) & 0x7;
1423 sa_sdst = instr->salu().imm & 0x1;
1424 } else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) {
1425 vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0;
1426 }
1427
1428 if (instr->isLDSDIR()) {
1429 unsigned count = handle_lds_direct_valu_hazard(state, instr);
1430 LDSDIR_instruction* ldsdir = &instr->ldsdir();
1431 if (count < va_vdst) {
1432 ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
1433 va_vdst = MIN2(va_vdst, count);
1434 }
1435 }
1436
1437 /* VALUTransUseHazard
1438 * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
1439 * in-between.
1440 */
1441 if (state.program->gfx_level < GFX11_5 && va_vdst > 0 && instr->isVALU()) {
1442 uint8_t num_valu = 15;
1443 uint8_t num_trans = 15;
1444 for (Operand& op : instr->operands) {
1445 if (op.physReg().reg() < 256)
1446 continue;
1447 for (unsigned i = 0; i < op.size(); i++) {
1448 PhysReg reg = op.physReg().advance(i * 4);
1449 num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(reg));
1450 num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(reg));
1451 }
1452 }
1453 if (num_trans <= 1 && num_valu <= 5) {
1454 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1455 va_vdst = 0;
1456 }
1457 }
1458
1459 if (va_vdst > 0 && state.program->gfx_level < GFX12 &&
1460 handle_valu_partial_forwarding_hazard(state, instr)) {
1461 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1462 va_vdst = 0;
1463 }
1464
1465 if (state.program->gfx_level < GFX12) {
1466 /* VALUMaskWriteHazard
1467 * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU or
1468 * VALU.
1469 */
1470 if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU()) &&
1471 check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
1472 bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1473 sa_sdst = 0;
1474 }
1475
      if (va_vdst == 0) {
         ctx.valu_since_wr_by_trans.reset();
         ctx.trans_since_wr_by_trans.reset();
      }

      if (sa_sdst == 0)
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();

      if (state.program->wave_size == 64 && instr->isSALU() &&
          check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
         unsigned reg = instr->definitions[0].physReg().reg();
         for (unsigned i = 0; i < instr->definitions[0].size(); i++)
            ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu[reg + i] = 1;
      }

      if (instr->isVALU()) {
         bool is_trans = instr->isTrans();

         ctx.valu_since_wr_by_trans.inc();
         if (is_trans)
            ctx.trans_since_wr_by_trans.inc();

         if (is_trans) {
            for (Definition& def : instr->definitions) {
               for (unsigned i = 0; i < def.size(); i++) {
                  PhysReg reg = def.physReg().advance(i * 4);
                  ctx.valu_since_wr_by_trans.set(reg);
                  ctx.trans_since_wr_by_trans.set(reg);
               }
            }
         }

         if (state.program->wave_size == 64) {
            for (Operand& op : instr->operands) {
               /* This should ignore exec reads */
               if (!op.isConstant() && op.physReg().reg() < 126)
                  ctx.sgpr_read_by_valu_as_lanemask.reset();
            }
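            /* These opcodes read an SGPR pair as a lane mask (carry-in or condition). */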
            switch (instr->opcode) {
            case aco_opcode::v_addc_co_u32:
            case aco_opcode::v_subb_co_u32:
            case aco_opcode::v_subbrev_co_u32:
            case aco_opcode::v_cndmask_b16:
            case aco_opcode::v_cndmask_b32:
            case aco_opcode::v_div_fmas_f32:
            case aco_opcode::v_div_fmas_f64:
               if (instr->operands.back().physReg() != exec) {
                  ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
                  ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
               }
               break;
            default: break;
            }
         }
      }
   } else {
      /* VALUReadSGPRHazard
       * An SGPR read by VALU and then written by SALU cannot safely be read by VALU/SALU.
       */
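      /* The hazard expires once the SALU write is old enough: 10 subsequent SALU instructions
       * for SALU readers, 11 for VALU readers (see expiry_count below).
       */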
      if (instr->isVALU() || instr->isSALU()) {
         unsigned expiry_count = instr->isSALU() ? 10 : 11;
         uint16_t imm = 0xffff;

         for (Operand& op : instr->operands) {
            if (op.physReg() >= m0)
               continue;

            for (unsigned i = 0; i < op.size(); i++) {
               PhysReg reg = op.physReg().advance(i * 4);
               if (ctx.sgpr_read_by_valu_then_wr_by_salu.get(reg) < expiry_count) {
                  imm &= 0xfffe;
                  sa_sdst = 0;
               }
               if (instr->isVALU()) {
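                  /* Reads are tracked at SGPR-pair granularity. */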
                  ctx.sgpr_read_by_valu.set(reg / 2);

                  /* s_wait_alu on va_sdst (if non-VCC SGPR) or va_vcc (if VCC SGPR) */
                  if (ctx.sgpr_read_by_valu_then_wr_by_valu[reg]) {
                     bool is_vcc = reg == vcc || reg == vcc_hi;
                     imm &= is_vcc ? 0xfffd : 0xf1ff;
                     if (is_vcc)
                        wait.va_vcc = 0;
                     else
                        wait.va_sdst = 0;
                  }
               }
            }
         }

         if (imm != 0xffff)
            bld.sopp(aco_opcode::s_waitcnt_depctr, imm);
      }

      if (sa_sdst == 0)
         ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
      else if (instr->isSALU() && !instr->isSOPP())
         ctx.sgpr_read_by_valu_then_wr_by_salu.inc();

      if (wait.va_sdst == 0) {
         std::bitset<m0.reg()> old = ctx.sgpr_read_by_valu_then_wr_by_valu;
         ctx.sgpr_read_by_valu_then_wr_by_valu.reset();
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = old[vcc];
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = old[vcc_hi];
      }
      if (wait.va_vcc == 0) {
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false;
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false;
      }

      if (instr->isVALU() && !instr->definitions.empty()) {
         PhysReg reg = instr->definitions[0].physReg();
         if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) {
            for (unsigned i = 0; i < instr->definitions[0].size(); i++)
               ctx.sgpr_read_by_valu_then_wr_by_valu.set(reg + i);
         }
      } else if (instr->isSALU() && !instr->definitions.empty()) {
         PhysReg reg = instr->definitions[0].physReg();
         if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) {
            for (unsigned i = 0; i < instr->definitions[0].size(); i++)
               ctx.sgpr_read_by_valu_then_wr_by_salu.set(reg.advance(i * 4));
         }
      }
   }

   /* LdsDirectVMEMHazard
    * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
    */
   if (instr->isVMEM() || instr->isFlatLike()) {
      if (instr->definitions.empty()) {
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
      } else {
         uint8_t vmem_type = state.program->gfx_level >= GFX12
                                ? get_vmem_type(state.program->gfx_level, instr.get())
                                : vmem_nosampler;
         std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
         if (vmem_type == vmem_sampler)
            vgprs = &ctx.vgpr_used_by_vmem_sample;
         else if (vmem_type == vmem_bvh)
            vgprs = &ctx.vgpr_used_by_vmem_bvh;

         for (Definition& def : instr->definitions)
            fill_vgpr_bitset(*vgprs, def.physReg(), def.bytes());
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(*vgprs, op.physReg(), op.bytes());
      }
   }
   if (instr->isDS() || instr->isFlat()) {
      for (Definition& def : instr->definitions)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes());
      for (Operand& op : instr->operands)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
   }
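   /* Any VALU/export or an explicit vm_vsrc=0 wait resolves the hazard; an s_waitcnt whose
    * counters reach zero clears the matching VGPR sets.
    */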
   wait_imm imm;
   if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_sample.reset();
      ctx.vgpr_used_by_vmem_bvh.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   } else if (imm.unpack(state.program->gfx_level, instr.get())) {
      if (imm.vm == 0)
         ctx.vgpr_used_by_vmem_load.reset();
      if (imm.sample == 0)
         ctx.vgpr_used_by_vmem_sample.reset();
      if (imm.bvh == 0)
         ctx.vgpr_used_by_vmem_bvh.reset();
      if (imm.lgkm == 0)
         ctx.vgpr_used_by_ds.reset();
      if (imm.vs == 0)
         ctx.vgpr_used_by_vmem_store.reset();
   }
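   /* If the LDSDIR destination VGPR is still tracked, wait: on GFX12 via the wait_vsrc field,
    * earlier via an explicit vm_vsrc=0 s_waitcnt_depctr (0xffe3 clears bits [4:2]).
    */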
   if (instr->isLDSDIR()) {
      unsigned vgpr = instr->definitions[0].physReg().reg() - 256;
      if (ctx.vgpr_used_by_vmem_load[vgpr] || ctx.vgpr_used_by_vmem_sample[vgpr] ||
          ctx.vgpr_used_by_vmem_bvh[vgpr] || ctx.vgpr_used_by_vmem_store[vgpr] ||
          ctx.vgpr_used_by_ds[vgpr]) {
         if (state.program->gfx_level >= GFX12)
            instr->ldsdir().wait_vsrc = 0;
         else
            bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
         ctx.vgpr_used_by_vmem_load.reset();
         ctx.vgpr_used_by_vmem_sample.reset();
         ctx.vgpr_used_by_vmem_bvh.reset();
         ctx.vgpr_used_by_vmem_store.reset();
         ctx.vgpr_used_by_ds.reset();
      }
   }

   /* WMMA Hazards */
   if (instr_info.classes[(int)instr->opcode] == instr_class::wmma) {
      assert(instr->operands.back().regClass() == instr->definitions[0].regClass());

      bool is_swmma = instr->operands.size() == 4;
      if (test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[0]) ||
          test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[1]) ||
          (is_swmma && test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[2]))) {
         bld.vop1(aco_opcode::v_nop);
      }

      ctx.vgpr_written_by_wmma.reset();
      fill_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->definitions[0].physReg(),
                       instr->definitions[0].bytes());
   } else if (instr->isVALU()) {
      ctx.vgpr_written_by_wmma.reset();
   }
}

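/* Predicate for search_backwards(): stops at a va_vdst=0 wait, when the search depth is
 * exhausted, or at a VALU instruction that touches a VGPR (in which case the hazards must be
 * assumed live).
 */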
bool
has_vdst0_since_valu_instr(bool& global_state, unsigned& block_state, aco_ptr<Instruction>& pred)
{
   if (parse_depctr_wait(pred.get()).va_vdst == 0)
      return true;

   if (--block_state == 0) {
      global_state = false;
      return true;
   }

   if (pred->isVALU()) {
      bool vgpr_rd_or_wr = false;
      for (Definition def : pred->definitions) {
         if (def.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      for (Operand op : pred->operands) {
         if (op.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      if (vgpr_rd_or_wr) {
         global_state = false;
         return true;
      }
   }

   return false;
}

void
resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

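   /* 0xffff waits for nothing; each bit cleared below adds the corresponding wait to the final
    * s_waitcnt_depctr.
    */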
   unsigned waitcnt_depctr = 0xffff;
   bool valu_read_sgpr = false;

   /* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
   bool has_vdst0_since_valu = true;
   unsigned depth = 16;
   search_backwards<bool, unsigned, nullptr, has_vdst0_since_valu_instr>(
      state, has_vdst0_since_valu, depth);
   if (!has_vdst0_since_valu) {
      waitcnt_depctr &= 0x0fff;
      ctx.valu_since_wr_by_trans.reset();
      ctx.trans_since_wr_by_trans.reset();
   }

   /* VcmpxPermlaneHazard/WMMAHazards */
   if (ctx.has_Vcmpx || ctx.vgpr_written_by_wmma.any()) {
      ctx.has_Vcmpx = false;
      ctx.vgpr_written_by_wmma.reset();
      bld.vop1(aco_opcode::v_nop);
   }

   /* VALUMaskWriteHazard */
   if (state.program->gfx_level < GFX12 && state.program->wave_size == 64) {
      if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any()) {
         waitcnt_depctr &= 0xfffe;
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
      }
      if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
         valu_read_sgpr = true;
         ctx.sgpr_read_by_valu_as_lanemask.reset();
      }
   }

   /* VALUReadSGPRHazard */
   if (state.program->gfx_level >= GFX12) {
      if (!ctx.sgpr_read_by_valu_then_wr_by_salu.empty())
         waitcnt_depctr &= 0xfffe;

      ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
      if (ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] ||
          ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi]) {
         waitcnt_depctr &= 0xfffd;
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false;
         ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false;
      }
      if (ctx.sgpr_read_by_valu_then_wr_by_valu.any()) {
         waitcnt_depctr &= 0xf1ff;
         ctx.sgpr_read_by_valu_then_wr_by_valu.reset();
      }
   }

   /* LdsDirectVMEMHazard */
   if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
       ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
       ctx.vgpr_used_by_vmem_bvh.any()) {
      waitcnt_depctr &= 0xffe3;
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
      ctx.vgpr_used_by_vmem_sample.reset();
      ctx.vgpr_used_by_vmem_bvh.reset();
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);

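   /* A VALU instruction reading an SGPR resolves the lane-mask tracking. v_xor3_b32
    * v0, v0, s0, s0 leaves v0 unchanged (x ^ s ^ s == x), so it only consumes the read.
    */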
   if (valu_read_sgpr) {
      /* This has to be after the s_waitcnt_depctr so that the instruction is not involved in any
       * other hazards. */
      bld.vop3(aco_opcode::v_xor3_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
               Operand(PhysReg(0), s1), Operand(PhysReg(0), s1));

      /* workaround possible LdsDirectVALUHazard/VALUPartialForwardingHazard */
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
   }
}

template <typename Ctx>
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
                             std::vector<aco_ptr<Instruction>>&);

template <typename Ctx>
using ResolveAll = void (*)(State& state, Ctx&, std::vector<aco_ptr<Instruction>>&);

template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   State state;
   state.program = program;
   state.block = &block;
   state.old_instructions = std::move(block.instructions);

   block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
   block.instructions.reserve(state.old_instructions.size());

   bool found_end = false;
   for (aco_ptr<Instruction>& instr : state.old_instructions) {
      Handle(state, ctx, instr, block.instructions);

      /* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
      if (instr->opcode == aco_opcode::s_setpc_b64) {
         block.instructions.emplace_back(std::move(instr));

         std::vector<aco_ptr<Instruction>> resolve_instrs;
         Resolve(state, ctx, resolve_instrs);
         block.instructions.insert(std::prev(block.instructions.end()),
                                   std::move_iterator(resolve_instrs.begin()),
                                   std::move_iterator(resolve_instrs.end()));

         found_end = true;
         continue;
      }

      found_end |= instr->opcode == aco_opcode::s_endpgm;
      block.instructions.emplace_back(std::move(instr));
   }

   /* Resolve all possible hazards (we don't know what the shader is concatenated with). */
   if (block.linear_succs.empty() && !found_end)
      Resolve(state, ctx, block.instructions);
}

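/* Walks blocks in program order, joining each block's context from its linear predecessors.
 * Loop bodies are walked a second time once the back-edge context is known; the second walk
 * stops early if the loop-header context did not change.
 */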
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
mitigate_hazards(Program* program, Ctx initial_ctx = Ctx())
{
   std::vector<Ctx> all_ctx(program->blocks.size());
   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& block = program->blocks[i];
      Ctx& ctx = all_ctx[i];

      if (i == 0 || (block.kind & block_kind_resume))
         ctx = initial_ctx;

      if (block.kind & block_kind_loop_header) {
         loop_header_indices.push(i);
      } else if (block.kind & block_kind_loop_exit) {
         /* Go through the whole loop again */
         for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
            Ctx loop_block_ctx;
            for (unsigned b : program->blocks[idx].linear_preds)
               loop_block_ctx.join(all_ctx[b]);

            handle_block<Ctx, Handle, Resolve>(program, loop_block_ctx, program->blocks[idx]);

            /* We only need to continue if the loop header context changed */
            if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
               break;

            all_ctx[idx] = loop_block_ctx;
         }

         loop_header_indices.pop();
      }

      for (unsigned b : block.linear_preds)
         ctx.join(all_ctx[b]);

      handle_block<Ctx, Handle, Resolve>(program, ctx, block);
   }
}

/* FeatureRequiredExportPriority in LLVM */
void
required_export_priority(Program* program)
{
   /* Skip callees, assuming that the caller has already increased the priority. */
   bool increase_priority = !program->is_epilog && !program->info.vs.has_prolog &&
                            (!program->info.merged_shader_compiled_separately ||
                             program->stage.sw == SWStage::VS || program->stage.sw == SWStage::TES);
   increase_priority |= program->is_prolog;

   for (Block& block : program->blocks) {
      std::vector<aco_ptr<Instruction>> new_instructions;
      new_instructions.reserve(block.instructions.size() + 6);

      Builder bld(program, &new_instructions);

      if (increase_priority && block.index == 0) {
         if (!block.instructions.empty() && block.instructions[0]->opcode == aco_opcode::s_setprio)
            block.instructions[0]->salu().imm = MAX2(block.instructions[0]->salu().imm, 2);
         else
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      for (unsigned i = 0; i < block.instructions.size(); i++) {
         Instruction* instr = block.instructions[i].get();
         new_instructions.push_back(std::move(block.instructions[i]));

         if (instr->opcode == aco_opcode::s_setprio) {
            instr->salu().imm = MAX2(instr->salu().imm, 2);
            continue;
         }

         bool end_of_export_sequence = instr->isEXP() && (i == block.instructions.size() - 1 ||
                                                          !block.instructions[i + 1]->isEXP());
         if (!end_of_export_sequence)
            continue;

         bool before_endpgm = false;
         if (i != block.instructions.size() - 1) {
            before_endpgm = block.instructions[i + 1]->opcode == aco_opcode::s_endpgm;
         } else {
            /* Does this fall through to an s_endpgm? */
            for (unsigned j = block.index + 1; j < program->blocks.size(); j++) {
               if (program->blocks[j].instructions.size() == 1 &&
                   program->blocks[j].instructions[0]->opcode == aco_opcode::s_endpgm)
                  before_endpgm = true;
               if (!program->blocks[j].instructions.empty())
                  break;
            }
         }

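         /* Lower priority to 0 at the end of an export sequence, wait for the exports and pad
          * with two s_nops before raising the priority again (unless the program ends here).
          */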
         bld.sopp(aco_opcode::s_setprio, 0);
         if (!before_endpgm)
            bld.sopk(aco_opcode::s_waitcnt_expcnt, Operand(sgpr_null, s1), 0);
         bld.sopp(aco_opcode::s_nop, 0);
         bld.sopp(aco_opcode::s_nop, 0);
         if (!before_endpgm)
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      block.instructions = std::move(new_instructions);
   }
}

} /* end namespace */

void
insert_NOPs(Program* program)
{
   if (program->gfx_level >= GFX11) {
      NOP_ctx_gfx11 initial_ctx;

      bool has_previous_part =
         program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog ||
         (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS &&
          program->stage.sw != SWStage::TES) || program->stage == raytracing_cs;
      if (program->gfx_level >= GFX12 && has_previous_part) {
         /* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that
          * any SGPR might have been read by VALU if there was a previous shader part.
          */
         initial_ctx.sgpr_read_by_valu.flip();
      }

      mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
                                                                                   initial_ctx);
   } else if (program->gfx_level >= GFX10_3) {
      ; /* no hazards/bugs to mitigate */
   } else if (program->gfx_level >= GFX10) {
      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
   } else {
      mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
   }

   if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
                                         program->stage.hw == AC_HW_PIXEL_SHADER))
      required_export_priority(program);
}

} // namespace aco