/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "util/bitset.h"

#include <algorithm>
#include <bitset>
#include <set>
#include <stack>
#include <vector>

namespace aco {
namespace {
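
/* Pass-wide context. Instructions are moved out of old_instructions one at a
 * time while the current block's instruction list (with any NOPs and waits
 * inserted) is rebuilt in block->instructions. */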
struct State {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> old_instructions;
};

struct NOP_ctx_gfx6 {
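   /* Merges hazard state at control-flow joins: pending wait-state counters
    * take the maximum of both paths and the SMEM clause state is ORed, so the
    * result is conservative for every incoming path. */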
   void join(const NOP_ctx_gfx6& other)
   {
      set_vskip_mode_then_vector =
         MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
      valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
      salu_wr_m0_then_gds_msg_ttrace =
         MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
      valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
      salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
      salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
      setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
      vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
      smem_clause |= other.smem_clause;
      smem_write |= other.smem_write;
      for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
         smem_clause_read_write[i] |= other.smem_clause_read_write[i];
         smem_clause_write[i] |= other.smem_clause_write[i];
      }
   }

   bool operator==(const NOP_ctx_gfx6& other)
   {
      return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
             valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
             vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
             salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
             valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
             salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
             salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
             setreg_then_getsetreg == other.setreg_then_getsetreg &&
             smem_clause == other.smem_clause && smem_write == other.smem_write &&
             BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
             BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
   }
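
   /* Ages all counters by the given number of elapsed wait states, clamping
    * at zero once a hazard window has passed. The vmem_store hazard only
    * needs a single wait state, so its bitset is simply cleared. */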
   void add_wait_states(unsigned amount)
   {
      if ((set_vskip_mode_then_vector -= amount) < 0)
         set_vskip_mode_then_vector = 0;

      if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
         valu_wr_vcc_then_div_fmas = 0;

      if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
         salu_wr_m0_then_gds_msg_ttrace = 0;

      if ((valu_wr_exec_then_dpp -= amount) < 0)
         valu_wr_exec_then_dpp = 0;

      if ((salu_wr_m0_then_lds -= amount) < 0)
         salu_wr_m0_then_lds = 0;

      if ((salu_wr_m0_then_moverel -= amount) < 0)
         salu_wr_m0_then_moverel = 0;

      if ((setreg_then_getsetreg -= amount) < 0)
         setreg_then_getsetreg = 0;

      vmem_store_then_wr_data.reset();
   }

   /* setting MODE.vskip and then any vector op requires 2 wait states */
   int8_t set_vskip_mode_then_vector = 0;

   /* VALU writing VCC followed by v_div_fmas requires 4 wait states */
   int8_t valu_wr_vcc_then_div_fmas = 0;

   /* SALU writing M0 followed by GDS, s_sendmsg or s_ttracedata requires 1 wait state */
   int8_t salu_wr_m0_then_gds_msg_ttrace = 0;

   /* VALU writing EXEC followed by DPP requires 5 wait states */
   int8_t valu_wr_exec_then_dpp = 0;

   /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX9 */
   int8_t salu_wr_m0_then_lds = 0;

   /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
   int8_t salu_wr_m0_then_moverel = 0;

   /* s_setreg followed by an s_getreg/s_setreg of the same register needs 2 wait states.
    * Currently we don't look at the actual register. */
   int8_t setreg_then_getsetreg = 0;

   /* Some memory instructions writing >64-bit data followed by an instruction
    * writing the VGPRs holding the write data require 1 wait state. */
   std::bitset<256> vmem_store_then_wr_data;

   /* We break up SMEM clauses that contain stores or overwrite an
    * operand/definition of another instruction in the clause. */
   bool smem_clause = false;
   bool smem_write = false;
   BITSET_DECLARE(smem_clause_read_write, 128) = {0};
   BITSET_DECLARE(smem_clause_write, 128) = {0};
};

struct NOP_ctx_gfx10 {
   bool has_VOPC_write_exec = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   bool has_NSA_MIMG = false;
   bool has_writelane = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_VMEM_store;
   std::bitset<128> sgprs_read_by_DS;
   std::bitset<128> sgprs_read_by_SMEM;

   void join(const NOP_ctx_gfx10& other)
   {
      has_VOPC_write_exec |= other.has_VOPC_write_exec;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      has_NSA_MIMG |= other.has_NSA_MIMG;
      has_writelane |= other.has_writelane;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_DS |= other.sgprs_read_by_DS;
      sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   bool operator==(const NOP_ctx_gfx10& other)
   {
      return has_VOPC_write_exec == other.has_VOPC_write_exec &&
             has_nonVALU_exec_read == other.has_nonVALU_exec_read && has_VMEM == other.has_VMEM &&
             has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS &&
             has_branch_after_DS == other.has_branch_after_DS &&
             has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_DS == other.sgprs_read_by_DS &&
             sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
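
/* Per-VGPR saturating counters: get() returns how many tracked events have
 * happened since the VGPR was last set(), clamped to Max (also returned for
 * VGPRs that were never set). Counters are stored relative to a shared base
 * so that inc() can advance all of them in O(1). */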
template <int Max> struct VGPRCounterMap {
public:
   int base = 0;
   BITSET_DECLARE(resident, 256);
   int val[256];

   /* Initializes all counters to Max. */
   VGPRCounterMap() { BITSET_ZERO(resident); }

   /* Increase all counters, clamping at Max. */
   void inc() { base++; }

   /* Set counter to 0. */
   void set(unsigned idx)
   {
      val[idx] = -base;
      BITSET_SET(resident, idx);
   }

   void set(PhysReg reg, unsigned bytes)
   {
      if (reg.reg() < 256)
         return;

      for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
         set(reg.reg() - 256 + i);
   }

   /* Reset all counters to Max. */
   void reset()
   {
      base = 0;
      BITSET_ZERO(resident);
   }

   void reset(PhysReg reg, unsigned bytes)
   {
      if (reg.reg() < 256)
         return;

      for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
         BITSET_CLEAR(resident, reg.reg() - 256 + i);
   }

   uint8_t get(unsigned idx)
   {
      return BITSET_TEST(resident, idx) ? MIN2(val[idx] + base, Max) : Max;
   }

   uint8_t get(PhysReg reg, unsigned offset = 0)
   {
      assert(reg.reg() >= 256);
      return get(reg.reg() - 256 + offset);
   }

   void join_min(const VGPRCounterMap& other)
   {
      unsigned i;
      BITSET_FOREACH_SET (i, other.resident, 256) {
         if (BITSET_TEST(resident, i))
            val[i] = MIN2(val[i] + base, other.val[i] + other.base) - base;
         else
            val[i] = other.val[i] + other.base - base;
      }
      BITSET_OR(resident, resident, other.resident);
   }

   bool operator==(const VGPRCounterMap& other) const
   {
      if (!BITSET_EQUAL(resident, other.resident))
         return false;

      unsigned i;
      BITSET_FOREACH_SET (i, other.resident, 256) {
         if (!BITSET_TEST(resident, i))
            return false;
         if (val[i] + base != other.val[i] + other.base)
            return false;
      }
      return true;
   }
};

struct NOP_ctx_gfx11 {
   /* VcmpxPermlaneHazard */
   bool has_Vcmpx = false;

   /* LdsDirectVMEMHazard */
   std::bitset<256> vgpr_used_by_vmem_load;
   std::bitset<256> vgpr_used_by_vmem_store;
   std::bitset<256> vgpr_used_by_ds;

   /* VALUTransUseHazard */
   VGPRCounterMap<15> valu_since_wr_by_trans;
   VGPRCounterMap<2> trans_since_wr_by_trans;

   /* VALUMaskWriteHazard */
   std::bitset<128> sgpr_read_by_valu_as_lanemask;
   std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;

   void join(const NOP_ctx_gfx11& other)
   {
      has_Vcmpx |= other.has_Vcmpx;
      vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
      vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
      vgpr_used_by_ds |= other.vgpr_used_by_ds;
      valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
      trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
      sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
      sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
         other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
   }

   bool operator==(const NOP_ctx_gfx11& other)
   {
      return has_Vcmpx == other.has_Vcmpx &&
             vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
             vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
             vgpr_used_by_ds == other.vgpr_used_by_ds &&
             valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
             trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
             sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
             sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
                other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
   }
};
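
/* Number of wait states an instruction represents when counting down hazard
 * distances; most instructions count as one. */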
int
get_wait_states(aco_ptr<Instruction>& instr)
{
   if (instr->opcode == aco_opcode::s_nop)
      return instr->sopp().imm + 1;
   else if (instr->opcode == aco_opcode::p_constaddr)
      return 3; /* lowered to 3 instructions in the assembler */
   else
      return 1;
}
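
/* True if the register ranges [a_reg, a_reg + a_size) and
 * [b_reg, b_reg + b_size) overlap. */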
bool
regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
   return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
}
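
/* Visits instructions backwards, starting at the current position and
 * continuing into all linear predecessors. instr_cb returns true to stop
 * searching along a path; block_cb (if non-null) returns false to skip a
 * block's predecessors. BlockState is copied for each path while GlobalState
 * is shared and accumulates the result. */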

template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards_internal(State& state, GlobalState& global_state, BlockState block_state,
                          Block* block, bool start_at_end)
{
   if (block == state.block && start_at_end) {
      /* If it's the current block, block->instructions is incomplete. */
      for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) {
         aco_ptr<Instruction>& instr = state.old_instructions[pred_idx];
         if (!instr)
            break; /* Instruction has been moved to block->instructions. */
         if (instr_cb(global_state, block_state, instr))
            return;
      }
   }

   for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
      if (instr_cb(global_state, block_state, block->instructions[pred_idx]))
         return;
   }

   PRAGMA_DIAGNOSTIC_PUSH
   PRAGMA_DIAGNOSTIC_IGNORED(-Waddress)
   if (block_cb != nullptr && !block_cb(global_state, block_state, block))
      return;
   PRAGMA_DIAGNOSTIC_POP

   for (unsigned lin_pred : block->linear_preds) {
      search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
         state, global_state, block_state, &state.program->blocks[lin_pred], true);
   }
}

template <typename GlobalState, typename BlockState,
          bool (*block_cb)(GlobalState&, BlockState&, Block*),
          bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
void
search_backwards(State& state, GlobalState& global_state, BlockState& block_state)
{
   search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
      state, global_state, block_state, state.block, false);
}

struct HandleRawHazardGlobalState {
   PhysReg reg;
   int nops_needed;
};

struct HandleRawHazardBlockState {
   uint32_t mask;
   int nops_needed;
};
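
/* search_backwards callback for read-after-write hazards: if a predecessor
 * instruction of one of the selected kinds (VALU/VINTRP/SALU) writes a
 * register still covered by the unresolved read mask, record how many wait
 * states are needed at this distance. Registers written by other instructions
 * are removed from the mask, since only the most recent write matters. */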
template <bool Valu, bool Vintrp, bool Salu>
bool
handle_raw_hazard_instr(HandleRawHazardGlobalState& global_state,
                        HandleRawHazardBlockState& block_state, aco_ptr<Instruction>& pred)
{
   unsigned mask_size = util_last_bit(block_state.mask);

   uint32_t writemask = 0;
   for (Definition& def : pred->definitions) {
      if (regs_intersect(global_state.reg, mask_size, def.physReg(), def.size())) {
         unsigned start = def.physReg() > global_state.reg ? def.physReg() - global_state.reg : 0;
         unsigned end = MIN2(mask_size, start + def.size());
         writemask |= u_bit_consecutive(start, end - start);
      }
   }

   bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) ||
                                       (pred->isSALU() && Salu));
   if (is_hazard) {
      global_state.nops_needed = MAX2(global_state.nops_needed, block_state.nops_needed);
      return true;
   }

   block_state.mask &= ~writemask;
   block_state.nops_needed = MAX2(block_state.nops_needed - get_wait_states(pred), 0);

   if (block_state.mask == 0)
      block_state.nops_needed = 0;

   return block_state.nops_needed == 0;
}

template <bool Valu, bool Vintrp, bool Salu>
void
handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
{
   if (*NOPs >= min_states)
      return;

   HandleRawHazardGlobalState global = {op.physReg(), 0};
   HandleRawHazardBlockState block = {u_bit_consecutive(0, op.size()), min_states};

   /* Loops require branch instructions, which count towards the wait
    * states. So even with loops this should finish unless nops_needed is some
    * huge value. */
   search_backwards<HandleRawHazardGlobalState, HandleRawHazardBlockState, nullptr,
                    handle_raw_hazard_instr<Valu, Vintrp, Salu>>(state, global, block);

   *NOPs = MAX2(*NOPs, global.nops_needed);
}

static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;
void
set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      BITSET_SET_RANGE_INSIDE_WORD(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      set_bitset_range(words, start, first_size);
      set_bitset_range(words, start + first_size, size - first_size);
   }
}

bool
test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      return BITSET_TEST_RANGE(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      return test_bitset_range(words, start, first_size) ||
             test_bitset_range(words, start + first_size, size - first_size);
   }
}

/* A SMEM clause is any group of consecutive SMEM instructions. The
 * instructions in this group may return out of order and/or may be replayed.
 *
 * To fix this potential hazard correctly, we have to make sure that when a
 * clause has more than one instruction, no instruction in the clause writes
 * to a register that is read by another instruction in the clause (including
 * itself). In this case, we have to break the SMEM clause by inserting
 * non-SMEM instructions.
 *
 * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
 */
void
handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                           int* NOPs)
{
   /* break off from previous SMEM clause if needed */
   if (!*NOPs && (ctx.smem_clause || ctx.smem_write)) {
      /* Don't allow clauses with store instructions since the clause's
       * instructions may use the same address. */
      if (ctx.smem_write || instr->definitions.empty() ||
          instr_info.is_atomic[(unsigned)instr->opcode]) {
         *NOPs = 1;
      } else if (program->dev.xnack_enabled) {
         for (Operand op : instr->operands) {
            if (!op.isConstant() &&
                test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
               *NOPs = 1;
               break;
            }
         }

         Definition def = instr->definitions[0];
         if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
            *NOPs = 1;
      }
   }
}

/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void
handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                        std::vector<aco_ptr<Instruction>>& new_instructions)
{
   /* check hazards */
   int NOPs = 0;

   if (instr->isSMEM()) {
      if (state.program->gfx_level == GFX6) {
         /* A read of an SGPR by an SMRD instruction requires 4 wait states
          * when the SGPR was written by a VALU instruction. According to LLVM,
          * there is also an undocumented hardware behavior when the buffer
          * descriptor is written by a SALU instruction. */
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand op = instr->operands[i];
            if (op.isConstant())
               continue;

            bool is_buffer_desc = i == 0 && op.size() > 2;
            if (is_buffer_desc)
               handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
            else
               handle_valu_then_read_hazard(state, &NOPs, 4, op);
         }
      }

      handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
   } else if (instr->isSALU()) {
      if (instr->opcode == aco_opcode::s_setreg_b32 ||
          instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
          instr->opcode == aco_opcode::s_getreg_b32) {
         NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
      }

      if (state.program->gfx_level == GFX9) {
         if (instr->opcode == aco_opcode::s_movrels_b32 ||
             instr->opcode == aco_opcode::s_movrels_b64 ||
             instr->opcode == aco_opcode::s_movreld_b32 ||
             instr->opcode == aco_opcode::s_movreld_b64) {
            NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
         }
      }

      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isDS() && instr->ds().gds) {
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isVALU() || instr->isVINTRP()) {
      if (instr->isDPP()) {
         NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
         handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
      }

      for (Definition def : instr->definitions) {
         if (def.regClass().type() != RegType::sgpr) {
            for (unsigned i = 0; i < def.size(); i++)
               NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
         }
      }

      if ((instr->opcode == aco_opcode::v_readlane_b32 ||
           instr->opcode == aco_opcode::v_readlane_b32_e64 ||
           instr->opcode == aco_opcode::v_writelane_b32 ||
           instr->opcode == aco_opcode::v_writelane_b32_e64) &&
          !instr->operands[1].isConstant()) {
         handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
      }

      /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
       * is followed by a read with v_readfirstlane or v_readlane to fix GPU
       * hangs on GFX6. Note that v_writelane_* is apparently not affected.
       * This hazard isn't documented anywhere, but AMD confirmed it.
       */
      if (state.program->gfx_level == GFX6 &&
          (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
           instr->opcode == aco_opcode::v_readfirstlane_b32)) {
         handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
      }

      if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
          instr->opcode == aco_opcode::v_div_fmas_f64)
         NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
      for (Operand op : instr->operands) {
         if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
            handle_valu_then_read_hazard(state, &NOPs, 5, op);
      }
   }

   if (!instr->isSALU() && instr->format != Format::SMEM)
      NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9) {
      bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
      if (instr->isVINTRP() || lds_scratch_global ||
          instr->opcode == aco_opcode::ds_read_addtid_b32 ||
          instr->opcode == aco_opcode::ds_write_addtid_b32 ||
          instr->opcode == aco_opcode::buffer_store_lds_dword) {
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
      }
   }

   ctx.add_wait_states(NOPs + get_wait_states(instr));

   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
   if (NOPs) {
      /* create NOP: s_nop N provides N+1 wait states, so subtract one */
      aco_ptr<SOPP_instruction> nop{
         create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
      nop->imm = NOPs - 1;
      nop->block = -1;
      new_instructions.emplace_back(std::move(nop));
   }

   /* update information to check for later hazards */
   if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
      ctx.smem_clause = false;
      ctx.smem_write = false;

      if (state.program->dev.xnack_enabled) {
         BITSET_ZERO(ctx.smem_clause_read_write);
         BITSET_ZERO(ctx.smem_clause_write);
      }
   }

   if (instr->isSMEM()) {
      if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
         ctx.smem_write = true;
      } else {
         ctx.smem_clause = true;

         if (state.program->dev.xnack_enabled) {
            for (Operand op : instr->operands) {
               if (!op.isConstant()) {
                  set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
               }
            }

            Definition def = instr->definitions[0];
            set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
            set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
         }
      }
   } else if (instr->isVALU()) {
      for (Definition def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            if (def.physReg() == vcc || def.physReg() == vcc_hi) {
               ctx.valu_wr_vcc_then_div_fmas = 4;
            }
            if (def.physReg() == exec || def.physReg() == exec_hi) {
               ctx.valu_wr_exec_then_dpp = 5;
            }
         }
      }
   } else if (instr->isSALU()) {
      if (!instr->definitions.empty()) {
         /* all other definitions should be SCC */
         Definition def = instr->definitions[0];
         if (def.physReg() == m0) {
            ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
            ctx.salu_wr_m0_then_lds = 1;
            ctx.salu_wr_m0_then_moverel = 1;
         }
      } else if (instr->opcode == aco_opcode::s_setreg_b32 ||
                 instr->opcode == aco_opcode::s_setreg_imm32_b32) {
         SOPK_instruction& sopk = instr->sopk();
         unsigned offset = (sopk.imm >> 6) & 0x1f;
         unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
         unsigned reg = sopk.imm & 0x3f;
         ctx.setreg_then_getsetreg = 2;

         /* check if the s_setreg writes the vskip bit (MODE hwreg, bit 28) */
         if (reg == 1 && offset >= 28 && size > (28 - offset))
            ctx.set_vskip_mode_then_vector = 2;
      }
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
      bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
                          instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
      /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
       * store) */
      bool consider_mimg = instr->isMIMG() &&
                           instr->operands[1].regClass().type() == RegType::vgpr &&
                           instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
      /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
      bool consider_flat =
         instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
      if (consider_buf || consider_mimg || consider_flat) {
         PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
         unsigned size = instr->operands[consider_flat ? 2 : 3].size();
         for (unsigned i = 0; i < size; i++)
            ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
      }
   }
}
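
/* search_backwards callback: sets global_state if the most recent instruction
 * on a path is a VINTRP, and always stops after one step. */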
bool
is_latest_instr_vintrp(bool& global_state, bool& block_state, aco_ptr<Instruction>& pred)
{
   if (pred->isVINTRP())
      global_state = true;
   return true;
}

template <bool Salu, bool Sgpr>
bool
handle_wr_hazard_instr(int& global_state, int& block_state, aco_ptr<Instruction>& pred)
{
   if (Salu ? pred->isSALU() : (pred->isVALU() || pred->isVINTRP())) {
      for (Definition dst : pred->definitions) {
         if ((dst.physReg().reg() < 256) == Sgpr) {
            global_state = MAX2(global_state, block_state);
            return true;
         }
      }
   }

   block_state -= get_wait_states(pred);
   return block_state <= 0;
}

template <bool Salu, bool Sgpr>
void
handle_wr_hazard(State& state, int* NOPs, int min_states)
{
   if (*NOPs >= min_states)
      return;

   int global = 0;
   int block = min_states;
   search_backwards<int, int, nullptr, handle_wr_hazard_instr<Salu, Sgpr>>(state, global, block);
   *NOPs = MAX2(*NOPs, global);
}
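
/* Resolves every GFX6-9 hazard ctx might still be tracking at once, scanning
 * backwards for the distance-based hazards and emitting a single s_nop big
 * enough to cover the worst case. */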
void
resolve_all_gfx6(State& state, NOP_ctx_gfx6& ctx,
                 std::vector<aco_ptr<Instruction>>& new_instructions)
{
   int NOPs = 0;

   /* SGPR->SMEM hazards */
   if (state.program->gfx_level == GFX6) {
      handle_wr_hazard<true, true>(state, &NOPs, 4);
      handle_wr_hazard<false, true>(state, &NOPs, 4);
   }

   /* Break up SMEM clauses */
   if (ctx.smem_clause || ctx.smem_write)
      NOPs = MAX2(NOPs, 1);

   /* SALU/GDS hazards */
   NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
   NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);

   /* VALU hazards */
   NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
   if (state.program->gfx_level >= GFX8)
      handle_wr_hazard<false, false>(state, &NOPs, 2); /* VALU->DPP */
   NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data.any() ? 1 : 0);
   if (state.program->gfx_level == GFX6) {
      /* VINTRP->v_readlane_b32/etc */
      bool vintrp = false;
      search_backwards<bool, bool, nullptr, is_latest_instr_vintrp>(state, vintrp, vintrp);
      if (vintrp)
         NOPs = MAX2(NOPs, 1);
   }
   NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);

   /* VALU(sgpr)->VMEM/v_readlane_b32/etc hazards. v_readlane_b32/etc require only 4 NOPs. */
   handle_wr_hazard<false, true>(state, &NOPs, 5);

   NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9)
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);

   ctx.add_wait_states(NOPs);
   if (NOPs) {
      Builder bld(state.program, &new_instructions);
      bld.sopp(aco_opcode::s_nop, -1, NOPs - 1);
   }
}

template <std::size_t N>
bool
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [&check_regs](const Definition& def) -> bool
                      {
                         bool writes_any = false;
                         for (unsigned i = 0; i < def.size(); i++) {
                            unsigned def_reg = def.physReg() + i;
                            writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
                         }
                         return writes_any;
                      });
}

template <std::size_t N>
bool
check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->operands.begin(), instr->operands.end(),
                      [&check_regs](const Operand& op) -> bool
                      {
                         if (op.isConstant())
                            return false;
                         bool reads_any = false;
                         for (unsigned i = 0; i < op.size(); i++) {
                            unsigned op_reg = op.physReg() + i;
                            reads_any |= op_reg < check_regs.size() && check_regs[op_reg];
                         }
                         return reads_any;
                      });
}

template <std::size_t N>
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   for (const Operand& op : instr->operands) {
      for (unsigned i = 0; i < op.size(); i++) {
         unsigned reg = op.physReg() + i;
         if (reg < reg_reads.size())
            reg_reads.set(reg);
      }
   }
}

template <std::size_t N>
void
mark_read_regs_exec(State& state, const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   mark_read_regs(instr, reg_reads);
   reg_reads.set(exec);
   if (state.program->wave_size == 64)
      reg_reads.set(exec_hi);
}

bool
VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
   if (instr->isVOPC())
      return true;
   if (instr->isVOP3() && instr->definitions.size() == 2)
      return true;
   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32_e64)
      return true;
   return false;
}

bool
instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [](const Definition& def) -> bool
                      { return def.getTemp().type() == RegType::sgpr; });
}

inline bool
instr_is_branch(const aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz ||
          instr->opcode == aco_opcode::s_cbranch_execz ||
          instr->opcode == aco_opcode::s_cbranch_execnz ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
          instr->opcode == aco_opcode::s_cbranch_cdbguser ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
          instr->opcode == aco_opcode::s_subvector_loop_begin ||
          instr->opcode == aco_opcode::s_subvector_loop_end ||
          instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
          instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
}

void
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
{
   // TODO: s_dcache_inv needs to be in its own group on GFX10

   Builder bld(state.program, &new_instructions);

   /* If this is an s_waitcnt_depctr, decode its vm_vsrc and sa_sdst fields,
    * since they can already resolve some of the hazards below. */
   unsigned vm_vsrc = 7;
   unsigned sa_sdst = 1;
   if (debug_flags & DEBUG_FORCE_WAITDEPS) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0000);
      vm_vsrc = 0;
      sa_sdst = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      vm_vsrc = (instr->sopp().imm >> 2) & 0x7;
      sa_sdst = instr->sopp().imm & 0x1;
   }

   /* VMEMtoScalarWriteHazard
    * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
    * in-between.
    */
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
      /* Remember all SGPRs that are read by the VMEM/DS instruction */
      if (instr->isVMEM() || instr->isFlatLike())
         mark_read_regs_exec(
            state, instr,
            instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM);
      if (instr->isFlat() || instr->isDS())
         mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
   } else if (instr->isSALU() || instr->isSMEM()) {
      if (instr->opcode == aco_opcode::s_waitcnt) {
         wait_imm imm(state.program->gfx_level, instr->sopp().imm);
         if (imm.vm == 0)
            ctx.sgprs_read_by_VMEM.reset();
         if (imm.lgkm == 0)
            ctx.sgprs_read_by_DS.reset();
      } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->sopk().imm == 0) {
         ctx.sgprs_read_by_VMEM_store.reset();
      } else if (vm_vsrc == 0) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();
      }

      /* Check if the SALU/SMEM writes an SGPR that was previously read by a VMEM/DS instruction */
      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) ||
          check_written_regs(instr, ctx.sgprs_read_by_DS) ||
          check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();

         /* Insert s_waitcnt_depctr with vm_vsrc=0 to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xffe3);
      }
   } else if (instr->isVALU()) {
      /* Hazard is mitigated by any VALU instruction */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
    */
   if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
      /* we only need to check definitions[0] because since GFX10 v_cmpx only writes one dest */
      ctx.has_VOPC_write_exec = true;
   } else if (ctx.has_VOPC_write_exec && (instr->opcode == aco_opcode::v_permlane16_b32 ||
                                          instr->opcode == aco_opcode::v_permlanex16_b32)) {
      ctx.has_VOPC_write_exec = false;

      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
      bld.vop1(aco_opcode::v_mov_b32, Definition(instr->operands[0].physReg(), v1),
               Operand(instr->operands[0].physReg(), v1));
   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
      ctx.has_VOPC_write_exec = false;
   }

   /* VcmpxExecWARHazard
    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
    */
   if (!instr->isVALU() && instr->reads_exec()) {
      ctx.has_nonVALU_exec_read = true;
   } else if (instr->isVALU() && ctx.has_nonVALU_exec_read) {
      if (instr->writes_exec()) {
         ctx.has_nonVALU_exec_read = false;

         /* Insert s_waitcnt_depctr with sa_sdst=0 to mitigate the problem */
         bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
      } else if (instr_writes_sgpr(instr)) {
         /* Any VALU instruction that writes an SGPR mitigates the problem */
         ctx.has_nonVALU_exec_read = false;
      }
   } else if (sa_sdst == 0) {
      ctx.has_nonVALU_exec_read = false;
   }

   /* SMEMtoVectorWriteHazard
    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
    */
   if (instr->isSMEM()) {
      /* Remember all SGPRs that are read by the SMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
   } else if (VALU_writes_sgpr(instr)) {
      /* Check if VALU writes an SGPR that was previously read by SMEM */
      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
         ctx.sgprs_read_by_SMEM.reset();

         /* Insert s_mov to mitigate the problem */
         bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
      }
   } else if (instr->isSALU()) {
      /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
      if (instr->opcode == aco_opcode::s_waitcnt_lgkmcnt) {
         const SOPK_instruction& sopk = instr->sopk();
         if (sopk.imm == 0 && sopk.operands[0].physReg() == sgpr_null)
            ctx.sgprs_read_by_SMEM.reset();
      } else if (instr->opcode == aco_opcode::s_waitcnt) {
         wait_imm imm(state.program->gfx_level, instr->sopp().imm);
         if (imm.lgkm == 0)
            ctx.sgprs_read_by_SMEM.reset();
      } else if (instr->format != Format::SOPP && instr->definitions.size()) {
         /* SALU can mitigate the hazard */
         ctx.sgprs_read_by_SMEM.reset();
      }
   }

   /* LdsBranchVmemWARHazard
    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
    */
   if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) {
      if (ctx.has_branch_after_DS)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_DS = false;
      ctx.has_VMEM = true;
   } else if (instr->isDS()) {
      if (ctx.has_branch_after_VMEM)
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_VMEM = false;
      ctx.has_DS = true;
   } else if (instr_is_branch(instr)) {
      ctx.has_branch_after_VMEM |= ctx.has_VMEM;
      ctx.has_branch_after_DS |= ctx.has_DS;
      ctx.has_VMEM = ctx.has_DS = false;
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      /* Only s_waitcnt_vscnt can mitigate the hazard */
      const SOPK_instruction& sopk = instr->sopk();
      if (sopk.operands[0].physReg() == sgpr_null && sopk.imm == 0)
         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug
    * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
    * 0).
    */
   if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
      ctx.has_NSA_MIMG = true;
   } else if (ctx.has_NSA_MIMG) {
      ctx.has_NSA_MIMG = false;

      if (instr->isMUBUF() || instr->isMTBUF()) {
         uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
         if (offset & 6) /* offset[2:1] != 0 */
            bld.sopp(aco_opcode::s_nop, -1, 0);
      }
   }

   /* waNsaCannotFollowWritelane
    * Handles NSA MIMG immediately following a v_writelane_b32.
    */
   if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
      ctx.has_writelane = true;
   } else if (ctx.has_writelane) {
      ctx.has_writelane = false;
      if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
         bld.sopp(aco_opcode::s_nop, -1, 0);
   }
}

void
resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

   size_t prev_count = new_instructions.size();

   /* VcmpxPermlaneHazard */
   if (ctx.has_VOPC_write_exec) {
      ctx.has_VOPC_write_exec = false;
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));

      /* VALU mitigates VMEMtoScalarWriteHazard. */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   unsigned waitcnt_depctr = 0xffff;

   /* VMEMtoScalarWriteHazard */
   if (ctx.sgprs_read_by_VMEM.any() || ctx.sgprs_read_by_DS.any() ||
       ctx.sgprs_read_by_VMEM_store.any()) {
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
      waitcnt_depctr &= 0xffe3; /* vm_vsrc=0 */
   }

   /* VcmpxExecWARHazard */
   if (ctx.has_nonVALU_exec_read) {
      ctx.has_nonVALU_exec_read = false;
      waitcnt_depctr &= 0xfffe; /* sa_sdst=0 */
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, -1, waitcnt_depctr);

   /* SMEMtoVectorWriteHazard */
   if (ctx.sgprs_read_by_SMEM.any()) {
      ctx.sgprs_read_by_SMEM.reset();
      bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
   }

   /* LdsBranchVmemWARHazard */
   if (ctx.has_VMEM || ctx.has_branch_after_VMEM || ctx.has_DS || ctx.has_branch_after_DS) {
      bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }

   /* NSAToVMEMBug/waNsaCannotFollowWritelane */
   if (ctx.has_NSA_MIMG || ctx.has_writelane) {
      ctx.has_NSA_MIMG = ctx.has_writelane = false;
      /* Any instruction resolves these hazards. */
      if (new_instructions.size() == prev_count)
         bld.sopp(aco_opcode::s_nop, -1, 0);
   }
}

void
fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
{
   if (reg.reg() < 256)
      return;
   for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
      set.set(reg.reg() - 256 + i);
}

/* GFX11 */
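
/* Returns the effective va_vdst this instruction waits for: 0 for
 * VMEM/FLAT/DS/EXP (which are treated as waiting for all outstanding VALU
 * writes), the encoded field for LDSDIR and s_waitcnt_depctr, and 15 (no
 * wait) otherwise. */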
unsigned
parse_vdst_wait(aco_ptr<Instruction>& instr)
{
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP())
      return 0;
   else if (instr->isLDSDIR())
      return instr->ldsdir().wait_vdst;
   else if (instr->opcode == aco_opcode::s_waitcnt_depctr)
      return (instr->sopp().imm >> 12) & 0xf;
   else
      return 15;
}

struct LdsDirectVALUHazardGlobalState {
   unsigned wait_vdst = 15;
   PhysReg vgpr;
   std::set<unsigned> loop_headers_visited;
};

struct LdsDirectVALUHazardBlockState {
   unsigned num_valu = 0;
   bool has_trans = false;

   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};

bool
handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state,
                                    aco_ptr<Instruction>& instr)
{
   if (instr->isVALU()) {
      block_state.has_trans |= instr->isTrans();

      bool uses_vgpr = false;
      for (Definition& def : instr->definitions)
         uses_vgpr |= regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1);
      for (Operand& op : instr->operands) {
         uses_vgpr |=
            !op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1);
      }
      if (uses_vgpr) {
         /* Transcendentals execute in parallel to other VALU and va_vdst count becomes unusable */
         global_state.wait_vdst =
            MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
         return true;
      }

      block_state.num_valu++;
   }

   if (parse_vdst_wait(instr) == 0)
      return true;

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set wait_vdst to be safe. */
      global_state.wait_vdst =
         MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
      return true;
   }

   return block_state.num_valu >= global_state.wait_vdst;
}

bool
handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state,
                                    LdsDirectVALUHazardBlockState& block_state, Block* block)
{
   if (block->kind & block_kind_loop_header) {
      if (global_state.loop_headers_visited.count(block->index))
         return false;
      global_state.loop_headers_visited.insert(block->index);
   }

   block_state.num_blocks++;

   return true;
}

unsigned
handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* LdsDirectVALUHazard
    * Handle LDSDIR writing a VGPR after it's used by a VALU instruction.
    */
   if (instr->ldsdir().wait_vdst == 0)
      return 0; /* early exit */

   LdsDirectVALUHazardGlobalState global_state;
   global_state.wait_vdst = instr->ldsdir().wait_vdst;
   global_state.vgpr = instr->definitions[0].physReg();
   LdsDirectVALUHazardBlockState block_state;
   search_backwards<LdsDirectVALUHazardGlobalState, LdsDirectVALUHazardBlockState,
                    &handle_lds_direct_valu_hazard_block, &handle_lds_direct_valu_hazard_instr>(
      state, global_state, block_state);
   return global_state.wait_vdst;
}

enum VALUPartialForwardingHazardState : uint8_t {
   nothing_written,
   written_after_exec_write,
   exec_written,
};

struct VALUPartialForwardingHazardGlobalState {
   bool hazard_found = false;
   std::set<unsigned> loop_headers_visited;
};

struct VALUPartialForwardingHazardBlockState {
   /* initialized by number of VGPRs read by VALU, decrement when encountered to return early */
   uint8_t num_vgprs_read = 0;
   BITSET_DECLARE(vgprs_read, 256) = {0};
   enum VALUPartialForwardingHazardState state = nothing_written;
   unsigned num_valu_since_read = 0;
   unsigned num_valu_since_write = 0;

   unsigned num_instrs = 0;
   unsigned num_blocks = 0;
};

bool
handle_valu_partial_forwarding_hazard_instr(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            aco_ptr<Instruction>& instr)
{
   if (instr->isSALU() && !instr->definitions.empty()) {
      if (block_state.state == written_after_exec_write && instr->writes_exec())
         block_state.state = exec_written;
   } else if (instr->isVALU()) {
      bool vgpr_write = false;
      for (Definition& def : instr->definitions) {
         if (def.physReg().reg() < 256)
            continue;

         for (unsigned i = 0; i < def.size(); i++) {
            unsigned reg = def.physReg().reg() - 256 + i;
            if (!BITSET_TEST(block_state.vgprs_read, reg))
               continue;

            if (block_state.state == exec_written && block_state.num_valu_since_write < 3) {
               global_state.hazard_found = true;
               return true;
            }

            BITSET_CLEAR(block_state.vgprs_read, reg);
            block_state.num_vgprs_read--;
            vgpr_write = true;
         }
      }

      if (vgpr_write) {
         /* If the state is nothing_written: the check below should ensure that this write is
          * close enough to the read.
          *
          * If the state is exec_written: the current choice of second write has failed. Reset and
          * try with the current write as the second one, if it's close enough to the read.
          *
          * If the state is written_after_exec_write: a further second write would be better, if
          * it's close enough to the read.
          */
         if (block_state.state == nothing_written || block_state.num_valu_since_read < 5) {
            block_state.state = written_after_exec_write;
            block_state.num_valu_since_write = 0;
         } else {
            block_state.num_valu_since_write++;
         }
      } else {
         block_state.num_valu_since_write++;
      }

      block_state.num_valu_since_read++;
   } else if (parse_vdst_wait(instr) == 0) {
      return true;
   }

   if (block_state.num_valu_since_read >= (block_state.state == nothing_written ? 5 : 8))
      return true; /* Hazard not possible at this distance. */
   if (block_state.num_vgprs_read == 0)
      return true; /* All VGPRs have been written and a hazard was never found. */

   block_state.num_instrs++;
   if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
      /* Exit to limit compile times and set hazard_found=true to be safe. */
      global_state.hazard_found = true;
      return true;
   }

   return false;
}

bool
handle_valu_partial_forwarding_hazard_block(VALUPartialForwardingHazardGlobalState& global_state,
                                            VALUPartialForwardingHazardBlockState& block_state,
                                            Block* block)
{
   if (block->kind & block_kind_loop_header) {
      if (global_state.loop_headers_visited.count(block->index))
         return false;
      global_state.loop_headers_visited.insert(block->index);
   }

   block_state.num_blocks++;

   return true;
}

bool
handle_valu_partial_forwarding_hazard(State& state, aco_ptr<Instruction>& instr)
{
   /* VALUPartialForwardingHazard
    * VALU instruction reads two VGPRs: one written before an exec write by SALU and one after.
    * For the hazard, there must be less than 3 VALU between the first and second VGPR writes.
    * There also must be less than 5 VALU between the second VGPR write and the current instruction.
    */
   if (state.program->wave_size != 64 || !instr->isVALU())
      return false;

   unsigned num_vgprs = 0;
   for (Operand& op : instr->operands)
      num_vgprs += op.physReg().reg() < 256 ? op.size() : 1;
   if (num_vgprs <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardBlockState block_state;

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Operand& op = instr->operands[i];
      if (op.physReg().reg() < 256)
         continue;
      for (unsigned j = 0; j < op.size(); j++)
         BITSET_SET(block_state.vgprs_read, op.physReg().reg() - 256 + j);
   }
   block_state.num_vgprs_read = BITSET_COUNT(block_state.vgprs_read);

   if (block_state.num_vgprs_read <= 1)
      return false; /* early exit */

   VALUPartialForwardingHazardGlobalState global_state;
   search_backwards<VALUPartialForwardingHazardGlobalState, VALUPartialForwardingHazardBlockState,
                    &handle_valu_partial_forwarding_hazard_block,
                    &handle_valu_partial_forwarding_hazard_instr>(state, global_state, block_state);
   return global_state.hazard_found;
}

void
handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
    */
   if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
      ctx.has_Vcmpx = true;
   } else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 ||
                                instr->opcode == aco_opcode::v_permlanex16_b32 ||
                                instr->opcode == aco_opcode::v_permlane64_b32)) {
      ctx.has_Vcmpx = false;

      /* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */
      bld.vop1(aco_opcode::v_nop);
   } else if (instr->isVALU()) {
      ctx.has_Vcmpx = false;
   }

   unsigned va_vdst = parse_vdst_wait(instr);
   unsigned vm_vsrc = 7;
   unsigned sa_sdst = 1;

   if (debug_flags & DEBUG_FORCE_WAITDEPS) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0000);
      va_vdst = 0;
      vm_vsrc = 0;
      sa_sdst = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      /* va_vdst already obtained through parse_vdst_wait(). */
      vm_vsrc = (instr->sopp().imm >> 2) & 0x7;
      sa_sdst = instr->sopp().imm & 0x1;
   }

   if (instr->isLDSDIR()) {
      unsigned count = handle_lds_direct_valu_hazard(state, instr);
      LDSDIR_instruction* ldsdir = &instr->ldsdir();
      if (count < va_vdst) {
         ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
         va_vdst = MIN2(va_vdst, count);
      }
   }

   /* VALUTransUseHazard
    * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
    * in-between.
    */
   if (va_vdst > 0 && instr->isVALU()) {
      uint8_t num_valu = 15;
      uint8_t num_trans = 15;
      for (Operand& op : instr->operands) {
         if (op.physReg().reg() < 256)
            continue;
         for (unsigned i = 0; i < op.size(); i++) {
            num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(op.physReg(), i));
            num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(op.physReg(), i));
         }
      }
      if (num_trans <= 1 && num_valu <= 5) {
         bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff); /* va_vdst=0 */
         va_vdst = 0;
      }
   }

   if (va_vdst > 0 && handle_valu_partial_forwarding_hazard(state, instr)) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff); /* va_vdst=0 */
      va_vdst = 0;
   }

   /* VALUMaskWriteHazard
    * An SGPR that is read by a VALU as a lane mask and then written by a SALU
    * cannot safely be read by a SALU afterwards.
    */
   if (state.program->wave_size == 64 && instr->isSALU() &&
       check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
      ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask;
      ctx.sgpr_read_by_valu_as_lanemask.reset();
   } else if (state.program->wave_size == 64 && instr->isSALU() &&
              check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
      bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe); /* sa_sdst=0 */
      sa_sdst = 0;
   }

   if (va_vdst == 0) {
      ctx.valu_since_wr_by_trans.reset();
      ctx.trans_since_wr_by_trans.reset();
   }

   if (sa_sdst == 0)
      ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
1480
   if (instr->isVALU()) {
      bool is_trans = instr->isTrans();

      ctx.valu_since_wr_by_trans.inc();
      if (is_trans)
         ctx.trans_since_wr_by_trans.inc();

      if (is_trans) {
         for (Definition& def : instr->definitions) {
            ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes());
            ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
         }
      }

      if (state.program->wave_size == 64) {
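         /* Any literal or scalar-register operand read by a VALU that is not one of the
          * lane-mask uses below clears the tracked set (assumption: in aco's physical
          * register numbering, values below 128 cover the SGPRs and special scalar
          * registers, while VGPRs start at 256). The opcodes below read their last
          * operand as a 64-bit lane mask, hence the two adjacent SGPRs being marked. */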
         for (Operand& op : instr->operands) {
            if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128))
               ctx.sgpr_read_by_valu_as_lanemask.reset();
         }
         switch (instr->opcode) {
         case aco_opcode::v_addc_co_u32:
         case aco_opcode::v_subb_co_u32:
         case aco_opcode::v_subbrev_co_u32:
         case aco_opcode::v_cndmask_b16:
         case aco_opcode::v_cndmask_b32:
         case aco_opcode::v_div_fmas_f32:
         case aco_opcode::v_div_fmas_f64:
            if (instr->operands.back().physReg() != exec) {
               ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
               ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
            }
            break;
         default: break;
         }
      }
   }

   /* LdsDirectVMEMHazard
    * Handle LDSDIR writing a VGPR after that VGPR is used by a VMEM/DS instruction.
    */
   if (instr->isVMEM() || instr->isFlatLike()) {
      for (Definition& def : instr->definitions)
         fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, def.physReg(), def.bytes());
      if (instr->definitions.empty()) {
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
      } else {
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, op.physReg(), op.bytes());
      }
   }
   if (instr->isDS() || instr->isFlat()) {
      for (Definition& def : instr->definitions)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes());
      for (Operand& op : instr->operands)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
   }
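   /* The tracked VGPR sets can be cleared once the hazard can no longer trigger: any
    * VALU/EXP instruction (or an explicit VM_VSRC=0 wait) resolves it, and zeroed
    * vm/lgkm/vscnt counters mean the corresponding loads, LDS accesses or stores have
    * completed. */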
   if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   } else if (instr->opcode == aco_opcode::s_waitcnt) {
      wait_imm imm(GFX11, instr->sopp().imm);
      if (imm.vm == 0)
         ctx.vgpr_used_by_vmem_load.reset();
      if (imm.lgkm == 0)
         ctx.vgpr_used_by_ds.reset();
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->sopk().imm == 0) {
      ctx.vgpr_used_by_vmem_store.reset();
   }
   if (instr->isLDSDIR()) {
      if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) {
         bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xffe3);
         ctx.vgpr_used_by_vmem_load.reset();
         ctx.vgpr_used_by_vmem_store.reset();
         ctx.vgpr_used_by_ds.reset();
      }
   }
}

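/* Backwards-search callback: checks whether a VA_VDST=0 wait has occurred since the last
 * VALU instruction that touches VGPRs. Returning true stops the search; global_state ends
 * up false if such a VALU is reached first, or if the search depth (block_state) is
 * exhausted, in which case the caller must conservatively emit a wait. */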
bool
has_vdst0_since_valu_instr(bool& global_state, unsigned& block_state, aco_ptr<Instruction>& pred)
{
   if (parse_vdst_wait(pred) == 0)
      return true;

   if (--block_state == 0) {
      global_state = false;
      return true;
   }

   if (pred->isVALU()) {
      bool vgpr_rd_or_wr = false;
      for (Definition def : pred->definitions) {
         if (def.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      for (Operand op : pred->operands) {
         if (op.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      if (vgpr_rd_or_wr) {
         global_state = false;
         return true;
      }
   }

   return false;
}

void
resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

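   /* Collect every required wait into a single s_waitcnt_depctr immediate: 0xffff waits
    * for nothing, and each hazard below clears the field it needs. */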
   unsigned waitcnt_depctr = 0xffff;

   /* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
   bool has_vdst0_since_valu = true;
   unsigned depth = 16;
   search_backwards<bool, unsigned, nullptr, has_vdst0_since_valu_instr>(
      state, has_vdst0_since_valu, depth);
   if (!has_vdst0_since_valu) {
      waitcnt_depctr &= 0x0fff;
      ctx.valu_since_wr_by_trans.reset();
      ctx.trans_since_wr_by_trans.reset();
   }

   /* VcmpxPermlaneHazard */
   if (ctx.has_Vcmpx) {
      ctx.has_Vcmpx = false;
      bld.vop1(aco_opcode::v_nop);
   }

   /* VALUMaskWriteHazard */
   if (state.program->wave_size == 64 &&
       (ctx.sgpr_read_by_valu_as_lanemask.any() ||
        ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any())) {
      waitcnt_depctr &= 0xfffe;
      ctx.sgpr_read_by_valu_as_lanemask.reset();
      ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
   }

   /* LdsDirectVMEMHazard */
   if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
       ctx.vgpr_used_by_ds.any()) {
      waitcnt_depctr &= 0xffe3;
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, -1, waitcnt_depctr);
}

template <typename Ctx>
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
                             std::vector<aco_ptr<Instruction>>&);

template <typename Ctx>
using ResolveAll = void (*)(State& state, Ctx&, std::vector<aco_ptr<Instruction>>&);

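/* Rewrites one block: Handle processes each instruction and may emit NOPs or waits in
 * front of it, while Resolve flushes all outstanding hazards. Resolve runs before any
 * s_setpc_b64 (unknown jump target) and at the end of a block with no linear successors. */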
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   State state;
   state.program = program;
   state.block = &block;
   state.old_instructions = std::move(block.instructions);

   block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
   block.instructions.reserve(state.old_instructions.size());

   bool found_end = false;
   for (aco_ptr<Instruction>& instr : state.old_instructions) {
      Handle(state, ctx, instr, block.instructions);

      /* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
      if (instr->opcode == aco_opcode::s_setpc_b64) {
         block.instructions.emplace_back(std::move(instr));

         std::vector<aco_ptr<Instruction>> resolve_instrs;
         Resolve(state, ctx, resolve_instrs);
         block.instructions.insert(std::prev(block.instructions.end()),
                                   std::move_iterator(resolve_instrs.begin()),
                                   std::move_iterator(resolve_instrs.end()));

         found_end = true;
         continue;
      }

      found_end |= instr->opcode == aco_opcode::s_endpgm;
      block.instructions.emplace_back(std::move(instr));
   }

   /* Resolve all possible hazards (we don't know what the shader is concatenated with). */
   if (block.linear_succs.empty() && !found_end)
      Resolve(state, ctx, block.instructions);
}

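/* Processes all blocks in program order, joining each block's context from its linear
 * predecessors. When a loop exit is reached, the loop's blocks are processed a second
 * time so that hazards flowing around the back-edge are taken into account; the second
 * pass stops early if the loop-header context did not change. */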
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
mitigate_hazards(Program* program)
{
   std::vector<Ctx> all_ctx(program->blocks.size());
   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& block = program->blocks[i];
      Ctx& ctx = all_ctx[i];

      if (block.kind & block_kind_loop_header) {
         loop_header_indices.push(i);
      } else if (block.kind & block_kind_loop_exit) {
         /* Go through the whole loop again */
         for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
            Ctx loop_block_ctx;
            for (unsigned b : program->blocks[idx].linear_preds)
               loop_block_ctx.join(all_ctx[b]);

            handle_block<Ctx, Handle, Resolve>(program, loop_block_ctx, program->blocks[idx]);

            /* We only need to continue if the loop header context changed */
            if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
               break;

            all_ctx[idx] = loop_block_ctx;
         }

         loop_header_indices.pop();
      }

      for (unsigned b : block.linear_preds)
         ctx.join(all_ctx[b]);

      handle_block<Ctx, Handle, Resolve>(program, ctx, block);
   }
}

} /* end namespace */

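/* Entry point: inserts NOPs and wait/dependency instructions to mitigate the hardware
 * hazards of the target generation (GFX10.3 has none that need mitigation here). */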
void
insert_NOPs(Program* program)
{
   if (program->gfx_level >= GFX11)
      mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program);
   else if (program->gfx_level >= GFX10_3)
      ; /* no hazards/bugs to mitigate */
   else if (program->gfx_level >= GFX10)
      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
   else
      mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
}

} // namespace aco