/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include <algorithm>
#include <bitset>
#include <stack>
#include <vector>

namespace aco {
namespace {

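/* Pass-wide state for the block currently being rewritten. old_instructions
 * holds the block's original instruction list (moved out of the block), so
 * backwards hazard scans can also look at instructions that have not been
 * re-inserted yet. */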
struct State {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> old_instructions;
};

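/* Hazard bookkeeping for GFX6-9: each counter holds the number of wait states
 * still required between a previously seen write and a potential reader, plus
 * state for tracking SMEM clauses. */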
struct NOP_ctx_gfx6 {
   void join(const NOP_ctx_gfx6& other)
   {
      set_vskip_mode_then_vector =
         MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
      valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz);
      valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz);
      valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
      salu_wr_m0_then_gds_msg_ttrace =
         MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
      valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
      salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
      salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
      setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
      vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
      smem_clause |= other.smem_clause;
      smem_write |= other.smem_write;
      for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
         smem_clause_read_write[i] |= other.smem_clause_read_write[i];
         smem_clause_write[i] |= other.smem_clause_write[i];
      }
   }

   bool operator==(const NOP_ctx_gfx6& other)
   {
      return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
             valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
             valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
             valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
             vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
             salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
             valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
             salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
             salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
             setreg_then_getsetreg == other.setreg_then_getsetreg &&
             smem_clause == other.smem_clause && smem_write == other.smem_write &&
             BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
             BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
   }

   void add_wait_states(unsigned amount)
   {
      if ((set_vskip_mode_then_vector -= amount) < 0)
         set_vskip_mode_then_vector = 0;

      if ((valu_wr_vcc_then_vccz -= amount) < 0)
         valu_wr_vcc_then_vccz = 0;

      if ((valu_wr_exec_then_execz -= amount) < 0)
         valu_wr_exec_then_execz = 0;

      if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
         valu_wr_vcc_then_div_fmas = 0;

      if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
         salu_wr_m0_then_gds_msg_ttrace = 0;

      if ((valu_wr_exec_then_dpp -= amount) < 0)
         valu_wr_exec_then_dpp = 0;

      if ((salu_wr_m0_then_lds -= amount) < 0)
         salu_wr_m0_then_lds = 0;

      if ((salu_wr_m0_then_moverel -= amount) < 0)
         salu_wr_m0_then_moverel = 0;

      if ((setreg_then_getsetreg -= amount) < 0)
         setreg_then_getsetreg = 0;

      vmem_store_then_wr_data.reset();
   }

   /* setting MODE.vskip and then any vector op requires 2 wait states */
   int8_t set_vskip_mode_then_vector = 0;

   /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */
   int8_t valu_wr_vcc_then_vccz = 0;
   int8_t valu_wr_exec_then_execz = 0;

   /* VALU writing VCC followed by v_div_fmas requires 4 wait states */
   int8_t valu_wr_vcc_then_div_fmas = 0;

   /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
   int8_t salu_wr_m0_then_gds_msg_ttrace = 0;

   /* VALU writing EXEC followed by DPP requires 5 wait states */
   int8_t valu_wr_exec_then_dpp = 0;

   /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
   int8_t salu_wr_m0_then_lds = 0;

   /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
   int8_t salu_wr_m0_then_moverel = 0;

   /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states.
    * Currently we don't look at the actual register. */
   int8_t setreg_then_getsetreg = 0;

   /* Some memory instructions writing >64bit followed by an instruction
    * writing the VGPRs holding the writedata require 1 wait state. */
   std::bitset<256> vmem_store_then_wr_data;

   /* we break up SMEM clauses that contain stores or overwrite an
    * operand/definition of another instruction in the clause */
   bool smem_clause = false;
   bool smem_write = false;
   BITSET_DECLARE(smem_clause_read_write, 128) = {0};
   BITSET_DECLARE(smem_clause_write, 128) = {0};
};

struct NOP_ctx_gfx10 {
   bool has_VOPC = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   bool has_NSA_MIMG = false;
   bool has_writelane = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_VMEM_store;
   std::bitset<128> sgprs_read_by_DS;
   std::bitset<128> sgprs_read_by_SMEM;

   void join(const NOP_ctx_gfx10& other)
   {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      has_NSA_MIMG |= other.has_NSA_MIMG;
      has_writelane |= other.has_writelane;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_DS |= other.sgprs_read_by_DS;
      sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   bool operator==(const NOP_ctx_gfx10& other)
   {
      return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
             has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM &&
             has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS &&
             has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_DS == other.sgprs_read_by_DS &&
             sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};

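/* Number of wait states an instruction accounts for (an s_nop with imm N
 * provides N + 1 of them). */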
int
get_wait_states(aco_ptr<Instruction>& instr)
{
   if (instr->opcode == aco_opcode::s_nop)
      return instr->sopp().imm + 1;
   else if (instr->opcode == aco_opcode::p_constaddr)
      return 3; /* lowered to 3 instructions in the assembler */
   else
      return 1;
}

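/* Check whether the register ranges [a_reg, a_reg + a_size) and
 * [b_reg, b_reg + b_size) overlap. */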
bool
regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
   return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
}

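/* Inspect a single earlier instruction "pred" during a backwards RAW-hazard
 * scan of the registers selected by *mask (relative to "reg"). Returns true
 * when the scan can stop: either "pred" is a hazardous write (in which case
 * *nops_needed still holds the required wait states), or enough wait states
 * and harmless overwrites have accumulated that no more NOPs are needed. */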
template <bool Valu, bool Vintrp, bool Salu>
bool
handle_raw_hazard_instr(aco_ptr<Instruction>& pred, PhysReg reg, int* nops_needed, uint32_t* mask)
{
   unsigned mask_size = util_last_bit(*mask);

   uint32_t writemask = 0;
   for (Definition& def : pred->definitions) {
      if (regs_intersect(reg, mask_size, def.physReg(), def.size())) {
         unsigned start = def.physReg() > reg ? def.physReg() - reg : 0;
         unsigned end = MIN2(mask_size, start + def.size());
         writemask |= u_bit_consecutive(start, end - start);
      }
   }

   bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) ||
                                       (pred->isSALU() && Salu));
   if (is_hazard)
      return true;

   *mask &= ~writemask;
   *nops_needed = MAX2(*nops_needed - get_wait_states(pred), 0);

   if (*mask == 0)
      *nops_needed = 0;

   return *nops_needed == 0;
}

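/* Walk backwards through the current block (and, through linear predecessors,
 * earlier blocks) and return how many wait states are still needed before the
 * registers selected by "mask" (relative to "reg") can safely be read. */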
template <bool Valu, bool Vintrp, bool Salu>
int
handle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask,
                           bool start_at_end)
{
   if (block == state.block && start_at_end) {
      /* If it's the current block, block->instructions is incomplete. */
      for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) {
         aco_ptr<Instruction>& instr = state.old_instructions[pred_idx];
         if (!instr)
            break; /* Instruction has been moved to block->instructions. */
         if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(instr, reg, &nops_needed, &mask))
            return nops_needed;
      }
   }
   for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
      if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(block->instructions[pred_idx], reg,
                                                      &nops_needed, &mask))
         return nops_needed;
   }

   int res = 0;

   /* Loops require branch instructions, which count towards the wait
    * states. So even with loops this should finish unless nops_needed is some
    * huge value. */
   for (unsigned lin_pred : block->linear_preds) {
      res =
         std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>(
                          state, &state.program->blocks[lin_pred], nops_needed, reg, mask, true));
   }
   return res;
}

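/* Raise *NOPs so that at least min_states wait states separate the last
 * hazardous write to op's registers from the current instruction. */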
template <bool Valu, bool Vintrp, bool Salu>
void
handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
{
   if (*NOPs >= min_states)
      return;
   int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(
      state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size()), false);
   *NOPs = MAX2(*NOPs, res);
}

static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;

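/* Set "size" consecutive bits starting at bit "start". Ranges that straddle a
 * BITSET_WORD boundary are split and handled recursively, e.g. with 32-bit
 * words, start=30/size=4 sets bits 30-31 of word 0 and bits 0-1 of word 1. */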
void
set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      BITSET_SET_RANGE_INSIDE_WORD(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      set_bitset_range(words, start, BITSET_WORDBITS - start_mod);
      set_bitset_range(words, start + first_size, size - first_size);
   }
}

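/* Test whether any of the "size" bits starting at bit "start" is set. */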
bool
test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      return BITSET_TEST_RANGE(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) ||
             test_bitset_range(words, start + first_size, size - first_size);
   }
}

/* A SMEM clause is any group of consecutive SMEM instructions. The
 * instructions in this group may return out of order and/or may be replayed.
 *
 * To fix this potential hazard correctly, we have to make sure that when a
 * clause has more than one instruction, no instruction in the clause writes
 * to a register that is read by another instruction in the clause (including
 * itself). In this case, we have to break the SMEM clause by inserting
 * non-SMEM instructions.
 *
 * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
 */
void
handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                           int* NOPs)
{
   /* break off from previous SMEM clause if needed */
   if (!*NOPs && (ctx.smem_clause || ctx.smem_write)) {
      /* Don't allow clauses with store instructions since the clause's
       * instructions may use the same address. */
      if (ctx.smem_write || instr->definitions.empty() ||
          instr_info.is_atomic[(unsigned)instr->opcode]) {
         *NOPs = 1;
      } else if (program->dev.xnack_enabled) {
         for (Operand op : instr->operands) {
            if (!op.isConstant() &&
                test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
               *NOPs = 1;
               break;
            }
         }

         Definition def = instr->definitions[0];
         if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
            *NOPs = 1;
      }
   }
}

/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void
handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                        std::vector<aco_ptr<Instruction>>& new_instructions)
{
   /* check hazards */
   int NOPs = 0;

   if (instr->isSMEM()) {
      if (state.program->gfx_level == GFX6) {
         /* A read of an SGPR by an SMRD instruction requires 4 wait states
          * when the SGPR was written by a VALU instruction. According to LLVM,
          * there is also an undocumented hardware behavior when the buffer
          * descriptor is written by a SALU instruction. */
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand op = instr->operands[i];
            if (op.isConstant())
               continue;

            bool is_buffer_desc = i == 0 && op.size() > 2;
            if (is_buffer_desc)
               handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
            else
               handle_valu_then_read_hazard(state, &NOPs, 4, op);
         }
      }

      handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
   } else if (instr->isSALU()) {
      if (instr->opcode == aco_opcode::s_setreg_b32 ||
          instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
          instr->opcode == aco_opcode::s_getreg_b32) {
         NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
      }

      if (state.program->gfx_level == GFX9) {
         if (instr->opcode == aco_opcode::s_movrels_b32 ||
             instr->opcode == aco_opcode::s_movrels_b64 ||
             instr->opcode == aco_opcode::s_movreld_b32 ||
             instr->opcode == aco_opcode::s_movreld_b64) {
            NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
         }
      }

      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isDS() && instr->ds().gds) {
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isVALU() || instr->isVINTRP()) {
      for (Operand op : instr->operands) {
         if (op.physReg() == vccz)
            NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_vccz);
         if (op.physReg() == execz)
            NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_execz);
      }

      if (instr->isDPP()) {
         NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
         handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
      }

      for (Definition def : instr->definitions) {
         if (def.regClass().type() != RegType::sgpr) {
            for (unsigned i = 0; i < def.size(); i++)
               NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
         }
      }

      if ((instr->opcode == aco_opcode::v_readlane_b32 ||
           instr->opcode == aco_opcode::v_readlane_b32_e64 ||
           instr->opcode == aco_opcode::v_writelane_b32 ||
           instr->opcode == aco_opcode::v_writelane_b32_e64) &&
          !instr->operands[1].isConstant()) {
         handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
      }

      /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
       * is followed by a read with v_readfirstlane or v_readlane to fix GPU
       * hangs on GFX6. Note that v_writelane_* is apparently not affected.
       * This hazard isn't documented anywhere, but AMD confirmed it.
       */
      if (state.program->gfx_level == GFX6 &&
          (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
           instr->opcode == aco_opcode::v_readfirstlane_b32)) {
         handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
      }

      if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
          instr->opcode == aco_opcode::v_div_fmas_f64)
         NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
      for (Operand op : instr->operands) {
         if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
            handle_valu_then_read_hazard(state, &NOPs, 5, op);
      }
   }

   if (!instr->isSALU() && instr->format != Format::SMEM)
      NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (state.program->gfx_level == GFX9) {
      bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
      if (instr->isVINTRP() || lds_scratch_global ||
          instr->opcode == aco_opcode::ds_read_addtid_b32 ||
          instr->opcode == aco_opcode::ds_write_addtid_b32 ||
          instr->opcode == aco_opcode::buffer_store_lds_dword) {
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
      }
   }

   ctx.add_wait_states(NOPs + get_wait_states(instr));

   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
   if (NOPs) {
      /* create NOP */
      aco_ptr<SOPP_instruction> nop{
         create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
      nop->imm = NOPs - 1;
      nop->block = -1;
      new_instructions.emplace_back(std::move(nop));
   }

   /* update information to check for later hazards */
   if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
      ctx.smem_clause = false;
      ctx.smem_write = false;

      if (state.program->dev.xnack_enabled) {
         BITSET_ZERO(ctx.smem_clause_read_write);
         BITSET_ZERO(ctx.smem_clause_write);
      }
   }

   if (instr->isSMEM()) {
      if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
         ctx.smem_write = true;
      } else {
         ctx.smem_clause = true;

         if (state.program->dev.xnack_enabled) {
            for (Operand op : instr->operands) {
               if (!op.isConstant()) {
                  set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
               }
            }

            Definition def = instr->definitions[0];
            set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
            set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
         }
      }
   } else if (instr->isVALU()) {
      for (Definition def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            if (def.physReg() == vcc || def.physReg() == vcc_hi) {
               ctx.valu_wr_vcc_then_vccz = 5;
               ctx.valu_wr_vcc_then_div_fmas = 4;
            }
            if (def.physReg() == exec || def.physReg() == exec_hi) {
               ctx.valu_wr_exec_then_execz = 5;
               ctx.valu_wr_exec_then_dpp = 5;
            }
         }
      }
   } else if (instr->isSALU()) {
      if (!instr->definitions.empty()) {
         /* all other definitions should be SCC */
         Definition def = instr->definitions[0];
         if (def.physReg() == m0) {
            ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
            ctx.salu_wr_m0_then_lds = 1;
            ctx.salu_wr_m0_then_moverel = 1;
         }
      } else if (instr->opcode == aco_opcode::s_setreg_b32 ||
                 instr->opcode == aco_opcode::s_setreg_imm32_b32) {
         SOPK_instruction& sopk = instr->sopk();
         unsigned offset = (sopk.imm >> 6) & 0x1f;
         unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
         unsigned reg = sopk.imm & 0x3f;
         ctx.setreg_then_getsetreg = 2;

         if (reg == 1 && offset >= 28 && size > (28 - offset))
            ctx.set_vskip_mode_then_vector = 2;
      }
   } else if (instr->isVMEM() || instr->isFlatLike()) {
      /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
      bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
                          instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
      /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
       * store) */
      bool consider_mimg = instr->isMIMG() &&
                           instr->operands[1].regClass().type() == RegType::vgpr &&
                           instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
      /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
      bool consider_flat =
         instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
      if (consider_buf || consider_mimg || consider_flat) {
         PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
         unsigned size = instr->operands[consider_flat ? 2 : 3].size();
         for (unsigned i = 0; i < size; i++)
            ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
      }
   }
}

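/* Check whether "instr" writes any register that is marked in check_regs. */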
template <std::size_t N>
bool
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [&check_regs](const Definition& def) -> bool
                      {
                         bool writes_any = false;
                         for (unsigned i = 0; i < def.size(); i++) {
                            unsigned def_reg = def.physReg() + i;
                            writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
                         }
                         return writes_any;
                      });
}

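/* Mark every register read by "instr" in reg_reads. */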
template <std::size_t N>
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   for (const Operand& op : instr->operands) {
      for (unsigned i = 0; i < op.size(); i++) {
         unsigned reg = op.physReg() + i;
         if (reg < reg_reads.size())
            reg_reads.set(reg);
      }
   }
}

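/* Like mark_read_regs, but also marks the EXEC mask, which is read implicitly. */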
template <std::size_t N>
void
mark_read_regs_exec(State& state, const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
   mark_read_regs(instr, reg_reads);
   reg_reads.set(exec);
   if (state.program->wave_size == 64)
      reg_reads.set(exec_hi);
}

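/* VALU instructions that write an SGPR: VOPC (writes VCC), VOP3 with two
 * definitions (carry-out) and the readlane family. */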
bool
VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
   if (instr->isVOPC())
      return true;
   if (instr->isVOP3() && instr->definitions.size() == 2)
      return true;
   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32_e64)
      return true;
   return false;
}

bool
instr_writes_exec(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [](const Definition& def) -> bool
                      { return def.physReg() == exec_lo || def.physReg() == exec_hi; });
}

bool
instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [](const Definition& def) -> bool
                      { return def.getTemp().type() == RegType::sgpr; });
}

inline bool
instr_is_branch(const aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz ||
          instr->opcode == aco_opcode::s_cbranch_execz ||
          instr->opcode == aco_opcode::s_cbranch_execnz ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
          instr->opcode == aco_opcode::s_cbranch_cdbguser ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
          instr->opcode == aco_opcode::s_subvector_loop_begin ||
          instr->opcode == aco_opcode::s_subvector_loop_end ||
          instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
          instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
}

void
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
{
   // TODO: s_dcache_inv needs to be in its own group on GFX10

   /* VMEMtoScalarWriteHazard
    * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
    * in-between.
    */
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
      /* Remember all SGPRs that are read by the VMEM/DS instruction */
      if (instr->isVMEM() || instr->isFlatLike())
         mark_read_regs_exec(
            state, instr,
            instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM);
      if (instr->isFlat() || instr->isDS())
         mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
   } else if (instr->isSALU() || instr->isSMEM()) {
      if (instr->opcode == aco_opcode::s_waitcnt) {
         wait_imm imm(state.program->gfx_level, instr->sopp().imm);
         if (imm.vm == 0)
            ctx.sgprs_read_by_VMEM.reset();
      } else if (instr->opcode == aco_opcode::s_waitcnt_depctr && instr->sopp().imm == 0xffe3) {
         /* Hazard is mitigated by a s_waitcnt_depctr with a magic imm */
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();
      }

      /* Check if SALU writes an SGPR that was previously read by the VALU */
      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) ||
          check_written_regs(instr, ctx.sgprs_read_by_DS) ||
          check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) {
         ctx.sgprs_read_by_VMEM.reset();
         ctx.sgprs_read_by_DS.reset();
         ctx.sgprs_read_by_VMEM_store.reset();

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         aco_ptr<SOPP_instruction> depctr{
            create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
         depctr->imm = 0xffe3;
         depctr->block = -1;
         new_instructions.emplace_back(std::move(depctr));
      }
   } else if (instr->isVALU()) {
      /* Hazard is mitigated by any VALU instruction */
      ctx.sgprs_read_by_VMEM.reset();
      ctx.sgprs_read_by_DS.reset();
      ctx.sgprs_read_by_VMEM_store.reset();
   }

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction, insert v_mov between them.
    */
   if (instr->isVOPC()) {
      ctx.has_VOPC = true;
   } else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 ||
                               instr->opcode == aco_opcode::v_permlanex16_b32)) {
      ctx.has_VOPC = false;

      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
      aco_ptr<VOP1_instruction> v_mov{
         create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
      v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1);
      v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1);
      new_instructions.emplace_back(std::move(v_mov));
   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
      ctx.has_VOPC = false;
   }

   /* VcmpxExecWARHazard
    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
    */
   if (!instr->isVALU() && instr->reads_exec()) {
      ctx.has_nonVALU_exec_read = true;
   } else if (instr->isVALU()) {
      if (instr_writes_exec(instr)) {
         ctx.has_nonVALU_exec_read = false;

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         aco_ptr<SOPP_instruction> depctr{
            create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
         depctr->imm = 0xfffe;
         depctr->block = -1;
         new_instructions.emplace_back(std::move(depctr));
      } else if (instr_writes_sgpr(instr)) {
         /* Any VALU instruction that writes an SGPR mitigates the problem */
         ctx.has_nonVALU_exec_read = false;
      }
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
      if ((instr->sopp().imm & 0xfffe) == 0xfffe)
         ctx.has_nonVALU_exec_read = false;
   }

   /* SMEMtoVectorWriteHazard
    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
    */
   if (instr->isSMEM()) {
      /* Remember all SGPRs that are read by the SMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
   } else if (VALU_writes_sgpr(instr)) {
      /* Check if VALU writes an SGPR that was previously read by SMEM */
      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
         ctx.sgprs_read_by_SMEM.reset();

         /* Insert s_mov to mitigate the problem */
         aco_ptr<SOP1_instruction> s_mov{
            create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
         s_mov->definitions[0] = Definition(sgpr_null, s1);
         s_mov->operands[0] = Operand::zero();
         new_instructions.emplace_back(std::move(s_mov));
      }
   } else if (instr->isSALU()) {
      if (instr->format != Format::SOPP) {
         /* SALU can mitigate the hazard */
         ctx.sgprs_read_by_SMEM.reset();
      } else {
         /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
         const SOPP_instruction& sopp = instr->sopp();
         if (sopp.opcode == aco_opcode::s_waitcnt_lgkmcnt) {
            if (sopp.imm == 0 && sopp.definitions[0].physReg() == sgpr_null)
               ctx.sgprs_read_by_SMEM.reset();
         } else if (sopp.opcode == aco_opcode::s_waitcnt) {
            unsigned lgkm = (sopp.imm >> 8) & 0x3f;
            if (lgkm == 0)
               ctx.sgprs_read_by_SMEM.reset();
         }
      }
   }

   /* LdsBranchVmemWARHazard
    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
    */
   if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) {
      ctx.has_VMEM = true;
      ctx.has_branch_after_VMEM = false;
      /* Mitigation for DS is needed only if there was already a branch after */
      ctx.has_DS = ctx.has_branch_after_DS;
   } else if (instr->isDS()) {
      ctx.has_DS = true;
      ctx.has_branch_after_DS = false;
      /* Mitigation for VMEM is needed only if there was already a branch after */
      ctx.has_VMEM = ctx.has_branch_after_VMEM;
   } else if (instr_is_branch(instr)) {
      ctx.has_branch_after_VMEM = ctx.has_VMEM;
      ctx.has_branch_after_DS = ctx.has_DS;
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      /* Only s_waitcnt_vscnt can mitigate the hazard */
      const SOPK_instruction& sopk = instr->sopk();
      if (sopk.definitions[0].physReg() == sgpr_null && sopk.imm == 0)
         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }
   if ((ctx.has_VMEM && ctx.has_branch_after_DS) || (ctx.has_DS && ctx.has_branch_after_VMEM)) {
      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;

      /* Insert s_waitcnt_vscnt to mitigate the problem */
      aco_ptr<SOPK_instruction> wait{
         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
      wait->definitions[0] = Definition(sgpr_null, s1);
      wait->imm = 0;
      new_instructions.emplace_back(std::move(wait));

      ctx.has_VMEM = instr->isVMEM() || instr->isGlobal() || instr->isScratch();
      ctx.has_DS = instr->isDS();
   }

   /* NSAToVMEMBug
    * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
    * 0).
    */
   if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
      ctx.has_NSA_MIMG = true;
   } else if (ctx.has_NSA_MIMG) {
      ctx.has_NSA_MIMG = false;

      if (instr->isMUBUF() || instr->isMTBUF()) {
         uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
         if (offset & 6)
            Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
      }
   }

   /* waNsaCannotFollowWritelane
    * Handles NSA MIMG immediately following a v_writelane_b32.
    */
   if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
      ctx.has_writelane = true;
   } else if (ctx.has_writelane) {
      ctx.has_writelane = false;
      if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
         Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
   }
}

template <typename Ctx>
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
                             std::vector<aco_ptr<Instruction>>&);

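/* Run the per-instruction hazard handler over one block. Instructions are
 * moved from state.old_instructions back into block.instructions one by one,
 * with any mitigation code emitted in front of them. */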
template <typename Ctx, HandleInstr<Ctx> Handle>
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   State state;
   state.program = program;
   state.block = &block;
   state.old_instructions = std::move(block.instructions);

   block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
   block.instructions.reserve(state.old_instructions.size());

   for (aco_ptr<Instruction>& instr : state.old_instructions) {
      Handle(state, ctx, instr, block.instructions);
      block.instructions.emplace_back(std::move(instr));
   }
}

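/* Process all blocks in program order, joining each block's hazard context
 * from its linear predecessors. When a loop exit is reached, the loop body is
 * processed again so that hazard state carried along back-edges is observed;
 * iteration stops as soon as the loop header's context no longer changes. */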
template <typename Ctx, HandleInstr<Ctx> Handle>
void
mitigate_hazards(Program* program)
{
   std::vector<Ctx> all_ctx(program->blocks.size());
   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& block = program->blocks[i];
      Ctx& ctx = all_ctx[i];

      if (block.kind & block_kind_loop_header) {
         loop_header_indices.push(i);
      } else if (block.kind & block_kind_loop_exit) {
         /* Go through the whole loop again */
         for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
            Ctx loop_block_ctx;
            for (unsigned b : program->blocks[idx].linear_preds)
               loop_block_ctx.join(all_ctx[b]);

            handle_block<Ctx, Handle>(program, loop_block_ctx, program->blocks[idx]);

            /* We only need to continue if the loop header context changed */
            if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
               break;

            all_ctx[idx] = loop_block_ctx;
         }

         loop_header_indices.pop();
      }

      for (unsigned b : block.linear_preds)
         ctx.join(all_ctx[b]);

      handle_block<Ctx, Handle>(program, ctx, block);
   }
}


} /* end namespace */

void
insert_NOPs(Program* program)
{
   if (program->gfx_level >= GFX10_3)
      ; /* no hazards/bugs to mitigate */
   else if (program->gfx_level >= GFX10)
      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10>(program);
   else
      mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program);
}

} // namespace aco