1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9
10 #include <map>
11 #include <stack>
12 #include <vector>
13
14 namespace aco {
15
16 namespace {
17
18 /* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
19 * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
20 * that we should switch to a different wave and contains info on dependencies as to
21 * when we can switch back.
22 *
23 * This seems to apply only for ALU->ALU dependencies as other instructions have better
24 * integration with the frontend.
25 *
26 * Note that if we do not emit s_delay_alu things will still be correct, but the wave
27 * will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
28 * I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
29 * instructions can take a different number of cycles based on the exec mask)
30 */
31 struct alu_delay_info {
32 /* These are the values directly above the max representable value, i.e. the wait
33 * would turn into a no-op when we try to wait for something further back than
34 * this.
35 */
36 static constexpr int8_t valu_nop = 5;
37 static constexpr int8_t trans_nop = 4;
38
39 /* How many VALU instructions ago this value was written */
40 int8_t valu_instrs = valu_nop;
41 /* Cycles until the writing VALU instruction is finished */
42 int8_t valu_cycles = 0;
43
44 /* How many Transcedent instructions ago this value was written */
45 int8_t trans_instrs = trans_nop;
46 /* Cycles until the writing Transcendent instruction is finished */
47 int8_t trans_cycles = 0;
48
49 /* Cycles until the writing SALU instruction is finished*/
50 int8_t salu_cycles = 0;
51
52 /* VALU wrote this as lane mask. */
53 bool lane_mask_forwarding = true;
54
combineaco::__anon58b1cc7c0111::alu_delay_info55 bool combine(const alu_delay_info& other)
56 {
57 bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
58 other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
59 other.trans_cycles > trans_cycles;
60 valu_instrs = std::min(valu_instrs, other.valu_instrs);
61 trans_instrs = std::min(trans_instrs, other.trans_instrs);
62 salu_cycles = std::max(salu_cycles, other.salu_cycles);
63 valu_cycles = std::max(valu_cycles, other.valu_cycles);
64 trans_cycles = std::max(trans_cycles, other.trans_cycles);
65 lane_mask_forwarding &= other.lane_mask_forwarding;
66 return changed;
67 }
68
69 /* Needs to be called after any change to keep the data consistent. */
fixupaco::__anon58b1cc7c0111::alu_delay_info70 bool fixup()
71 {
72 if (valu_instrs >= valu_nop || valu_cycles <= 0) {
73 valu_instrs = valu_nop;
74 valu_cycles = 0;
75 }
76
77 if (trans_instrs >= trans_nop || trans_cycles <= 0) {
78 trans_instrs = trans_nop;
79 trans_cycles = 0;
80 }
81
82 salu_cycles = std::max<int8_t>(salu_cycles, 0);
83
84 return empty();
85 }
86
87 /* Returns true if a wait would be a no-op */
emptyaco::__anon58b1cc7c0111::alu_delay_info88 bool empty() const
89 {
90 return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
91 }
92
printaco::__anon58b1cc7c0111::alu_delay_info93 UNUSED void print(FILE* output) const
94 {
95 if (valu_instrs != valu_nop)
96 fprintf(output, "valu_instrs: %u\n", valu_instrs);
97 if (valu_cycles)
98 fprintf(output, "valu_cycles: %u\n", valu_cycles);
99 if (trans_instrs != trans_nop)
100 fprintf(output, "trans_instrs: %u\n", trans_instrs);
101 if (trans_cycles)
102 fprintf(output, "trans_cycles: %u\n", trans_cycles);
103 if (salu_cycles)
104 fprintf(output, "salu_cycles: %u\n", salu_cycles);
105 }
106 };
107
108 struct delay_ctx {
109 Program* program;
110 std::map<PhysReg, alu_delay_info> gpr_map;
111
delay_ctxaco::__anon58b1cc7c0111::delay_ctx112 delay_ctx() {}
delay_ctxaco::__anon58b1cc7c0111::delay_ctx113 delay_ctx(Program* program_) : program(program_) {}
114
printaco::__anon58b1cc7c0111::delay_ctx115 UNUSED void print(FILE* output) const
116 {
117 for (const auto& entry : gpr_map) {
118 fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
119 entry.first.reg() & 0xff);
120 entry.second.print(output);
121 fprintf(output, "}\n");
122 }
123 }
124 };
125
126 void
check_alu(delay_ctx & ctx,alu_delay_info & delay,Instruction * instr)127 check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
128 {
129 for (unsigned i = 0; i < instr->operands.size(); i++) {
130 const Operand op = instr->operands[i];
131 alu_delay_info op_delay;
132 if (op.isConstant() || op.isUndefined())
133 continue;
134
135 /* check consecutively read gprs */
136 for (unsigned j = 0; j < op.size(); j++) {
137 std::map<PhysReg, alu_delay_info>::iterator it =
138 ctx.gpr_map.find(PhysReg{op.physReg() + j});
139 if (it != ctx.gpr_map.end())
140 op_delay.combine(it->second);
141 }
142
143 bool fast_forward = (instr->opcode == aco_opcode::v_cndmask_b32 ||
144 instr->opcode == aco_opcode::v_cndmask_b16 ||
145 instr->opcode == aco_opcode::v_dual_cndmask_b32) &&
146 i == 2;
147 fast_forward |= instr->isVOPD() && instr->vopd().opy == aco_opcode::v_dual_cndmask_b32 &&
148 i + 1 == instr->operands.size();
149 if (!op_delay.lane_mask_forwarding || !fast_forward)
150 delay.combine(op_delay);
151 }
152 }
153
154 void
update_alu(delay_ctx & ctx,bool is_valu,bool is_trans,int cycles)155 update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
156 {
157 std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
158 while (it != ctx.gpr_map.end()) {
159 alu_delay_info& entry = it->second;
160 entry.valu_instrs += is_valu ? 1 : 0;
161 entry.trans_instrs += is_trans ? 1 : 0;
162 entry.salu_cycles -= cycles;
163 entry.valu_cycles -= cycles;
164 entry.trans_cycles -= cycles;
165 it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
166 }
167 }
168
169 void
kill_alu(alu_delay_info & delay,Instruction * instr,delay_ctx & ctx)170 kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
171 {
172 /* Consider frontend waits first. These are automatically done by the hardware,
173 * so we don't need to insert s_delay_alu.
174 * They are also lower granularity, waiting for accesses of a counter instead
175 * of only the real per register dependencies.
176 */
177 depctr_wait wait = parse_depctr_wait(instr);
178
179 int8_t implict_cycles = 0;
180 if (!wait.va_vdst || !wait.va_sdst || !wait.va_vcc || !wait.sa_sdst || !wait.sa_exec ||
181 !wait.va_exec) {
182 std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
183 while (it != ctx.gpr_map.end()) {
184 alu_delay_info& entry = it->second;
185 bool wait_valu = !wait.va_vdst || (it->first < vcc && !wait.va_sdst) ||
186 (it->first >= vcc && it->first <= vcc_hi && !wait.va_vcc) ||
187 (it->first >= exec && it->first <= exec_hi && !wait.va_exec);
188 if (wait_valu) {
189 implict_cycles = MAX3(implict_cycles, entry.valu_cycles, entry.trans_cycles);
190 entry.valu_cycles = 0;
191 entry.trans_cycles = 0;
192 }
193 bool wait_salu = ((it->first <= vcc_hi || it->first == scc) && !wait.sa_sdst) ||
194 (it->first >= exec && it->first <= exec_hi && !wait.sa_exec);
195 if (wait_salu) {
196 implict_cycles = MAX2(implict_cycles, entry.salu_cycles);
197 entry.salu_cycles = 0;
198 }
199 it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
200 }
201 }
202
203 /* Previous alu progresses as usual while the frontend waits. */
204 if (implict_cycles != 0)
205 update_alu(ctx, false, false, implict_cycles);
206
207 if (instr->isVALU() || instr->isSALU())
208 check_alu(ctx, delay, instr);
209
210 if (!delay.empty()) {
211 update_alu(ctx, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
212
213 /* remove all gprs with higher counter from map */
214 std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
215 while (it != ctx.gpr_map.end()) {
216 if (delay.valu_instrs <= it->second.valu_instrs)
217 it->second.valu_instrs = alu_delay_info::valu_nop;
218 if (delay.trans_instrs <= it->second.trans_instrs)
219 it->second.trans_instrs = alu_delay_info::trans_nop;
220 it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
221 }
222 }
223 }
224
225 void
gen_alu(Instruction * instr,delay_ctx & ctx)226 gen_alu(Instruction* instr, delay_ctx& ctx)
227 {
228 Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
229 bool is_valu = instr->isVALU();
230 bool is_trans = instr->isTrans();
231
232 if (is_trans || is_valu || instr->isSALU()) {
233 alu_delay_info delay;
234 delay.lane_mask_forwarding = false;
235 if (is_trans) {
236 delay.trans_instrs = 0;
237 delay.trans_cycles = cycle_info.latency;
238 } else if (is_valu) {
239 delay.valu_instrs = 0;
240 delay.valu_cycles = cycle_info.latency;
241 } else if (instr->isSALU()) {
242 delay.salu_cycles = cycle_info.latency;
243 }
244
245 for (Definition& def : instr->definitions) {
246 if (is_valu && def.regClass() == ctx.program->lane_mask) {
247 delay.lane_mask_forwarding = instr->opcode != aco_opcode::v_readlane_b32_e64 &&
248 instr->opcode != aco_opcode::v_readfirstlane_b32;
249 }
250
251 for (unsigned j = 0; j < def.size(); j++) {
252 auto it = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + j}, delay);
253 if (!it.second)
254 it.first->second.combine(delay);
255 }
256 }
257 }
258
259 update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans,
260 cycle_info.issue_cycles);
261 }
262
263 void
emit_delay_alu(delay_ctx & ctx,std::vector<aco_ptr<Instruction>> & instructions,alu_delay_info & delay)264 emit_delay_alu(delay_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
265 alu_delay_info& delay)
266 {
267 uint32_t imm = 0;
268 if (delay.trans_instrs != delay.trans_nop) {
269 imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
270 }
271
272 if (delay.valu_instrs != delay.valu_nop) {
273 imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
274 }
275
276 /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
277 * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
278 * getting this wrong isn't an issue. */
279 if (delay.salu_cycles && imm <= 0xf) {
280 unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
281 imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
282 }
283
284 Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
285 inst->salu().imm = imm;
286 instructions.emplace_back(inst);
287 delay = alu_delay_info();
288 }
289
290 void
handle_block(Program * program,Block & block,delay_ctx & ctx)291 handle_block(Program* program, Block& block, delay_ctx& ctx)
292 {
293 std::vector<aco_ptr<Instruction>> new_instructions;
294 alu_delay_info queued_delay;
295
296 for (size_t i = 0; i < block.instructions.size(); i++) {
297 aco_ptr<Instruction>& instr = block.instructions[i];
298 assert(instr->opcode != aco_opcode::s_delay_alu);
299
300 kill_alu(queued_delay, instr.get(), ctx);
301 gen_alu(instr.get(), ctx);
302
303 if (!queued_delay.empty())
304 emit_delay_alu(ctx, new_instructions, queued_delay);
305 new_instructions.emplace_back(std::move(instr));
306 }
307
308 block.instructions.swap(new_instructions);
309 }
310
311 } /* end namespace */
312
313 void
insert_delay_alu(Program * program)314 insert_delay_alu(Program* program)
315 {
316 /* per BB ctx */
317 delay_ctx ctx(program);
318
319 for (unsigned i = 0; i < program->blocks.size(); i++) {
320 Block& current = program->blocks[i];
321
322 if (current.instructions.empty())
323 continue;
324
325 handle_block(program, current, ctx);
326
327 /* Reset ctx if there is a jump, assuming ALU will be done
328 * because branch latency is pretty high.
329 */
330 if (current.linear_succs.empty() ||
331 current.instructions.back()->opcode == aco_opcode::s_branch) {
332 ctx = delay_ctx(program);
333 }
334 }
335 }
336
337 void
combine_delay_alu(Program * program)338 combine_delay_alu(Program* program)
339 {
340 /* Combine s_delay_alu using the skip field. */
341 for (Block& block : program->blocks) {
342 int i = 0;
343 int prev_delay_alu = -1;
344 for (aco_ptr<Instruction>& instr : block.instructions) {
345 if (instr->opcode != aco_opcode::s_delay_alu) {
346 block.instructions[i++] = std::move(instr);
347 continue;
348 }
349
350 uint16_t imm = instr->salu().imm;
351 int skip = i - prev_delay_alu - 1;
352 if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
353 if (imm >> 7 == 0)
354 prev_delay_alu = i;
355 block.instructions[i++] = std::move(instr);
356 continue;
357 }
358
359 block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
360 prev_delay_alu = -1;
361 }
362 block.instructions.resize(i);
363 }
364 }
365
366 } // namespace aco
367