• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9 
10 #include <map>
11 #include <stack>
12 #include <vector>
13 
14 namespace aco {
15 
16 namespace {
17 
18 /* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
19  * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
20  * that we should switch to a different wave and contains info on dependencies as to
21  * when we can switch back.
22  *
23  * This seems to apply only for ALU->ALU dependencies as other instructions have better
24  * integration with the frontend.
25  *
26  * Note that if we do not emit s_delay_alu things will still be correct, but the wave
27  * will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
28  * I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
29  * instructions can take a different number of cycles based on the exec mask)
30  */
31 struct alu_delay_info {
32    /* These are the values directly above the max representable value, i.e. the wait
33     * would turn into a no-op when we try to wait for something further back than
34     * this.
35     */
36    static constexpr int8_t valu_nop = 5;
37    static constexpr int8_t trans_nop = 4;
38 
39    /* How many VALU instructions ago this value was written */
40    int8_t valu_instrs = valu_nop;
41    /* Cycles until the writing VALU instruction is finished */
42    int8_t valu_cycles = 0;
43 
44    /* How many Transcedent instructions ago this value was written */
45    int8_t trans_instrs = trans_nop;
46    /* Cycles until the writing Transcendent instruction is finished */
47    int8_t trans_cycles = 0;
48 
49    /* Cycles until the writing SALU instruction is finished*/
50    int8_t salu_cycles = 0;
51 
52    /* VALU wrote this as lane mask. */
53    bool lane_mask_forwarding = true;
54 
combineaco::__anon58b1cc7c0111::alu_delay_info55    bool combine(const alu_delay_info& other)
56    {
57       bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
58                      other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
59                      other.trans_cycles > trans_cycles;
60       valu_instrs = std::min(valu_instrs, other.valu_instrs);
61       trans_instrs = std::min(trans_instrs, other.trans_instrs);
62       salu_cycles = std::max(salu_cycles, other.salu_cycles);
63       valu_cycles = std::max(valu_cycles, other.valu_cycles);
64       trans_cycles = std::max(trans_cycles, other.trans_cycles);
65       lane_mask_forwarding &= other.lane_mask_forwarding;
66       return changed;
67    }
68 
69    /* Needs to be called after any change to keep the data consistent. */
fixupaco::__anon58b1cc7c0111::alu_delay_info70    bool fixup()
71    {
72       if (valu_instrs >= valu_nop || valu_cycles <= 0) {
73          valu_instrs = valu_nop;
74          valu_cycles = 0;
75       }
76 
77       if (trans_instrs >= trans_nop || trans_cycles <= 0) {
78          trans_instrs = trans_nop;
79          trans_cycles = 0;
80       }
81 
82       salu_cycles = std::max<int8_t>(salu_cycles, 0);
83 
84       return empty();
85    }
86 
87    /* Returns true if a wait would be a no-op */
emptyaco::__anon58b1cc7c0111::alu_delay_info88    bool empty() const
89    {
90       return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
91    }
92 
printaco::__anon58b1cc7c0111::alu_delay_info93    UNUSED void print(FILE* output) const
94    {
95       if (valu_instrs != valu_nop)
96          fprintf(output, "valu_instrs: %u\n", valu_instrs);
97       if (valu_cycles)
98          fprintf(output, "valu_cycles: %u\n", valu_cycles);
99       if (trans_instrs != trans_nop)
100          fprintf(output, "trans_instrs: %u\n", trans_instrs);
101       if (trans_cycles)
102          fprintf(output, "trans_cycles: %u\n", trans_cycles);
103       if (salu_cycles)
104          fprintf(output, "salu_cycles: %u\n", salu_cycles);
105    }
106 };
107 
108 struct delay_ctx {
109    Program* program;
110    std::map<PhysReg, alu_delay_info> gpr_map;
111 
delay_ctxaco::__anon58b1cc7c0111::delay_ctx112    delay_ctx() {}
delay_ctxaco::__anon58b1cc7c0111::delay_ctx113    delay_ctx(Program* program_) : program(program_) {}
114 
printaco::__anon58b1cc7c0111::delay_ctx115    UNUSED void print(FILE* output) const
116    {
117       for (const auto& entry : gpr_map) {
118          fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
119                  entry.first.reg() & 0xff);
120          entry.second.print(output);
121          fprintf(output, "}\n");
122       }
123    }
124 };
125 
126 void
check_alu(delay_ctx & ctx,alu_delay_info & delay,Instruction * instr)127 check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
128 {
129    for (unsigned i = 0; i < instr->operands.size(); i++) {
130       const Operand op = instr->operands[i];
131       alu_delay_info op_delay;
132       if (op.isConstant() || op.isUndefined())
133          continue;
134 
135       /* check consecutively read gprs */
136       for (unsigned j = 0; j < op.size(); j++) {
137          std::map<PhysReg, alu_delay_info>::iterator it =
138             ctx.gpr_map.find(PhysReg{op.physReg() + j});
139          if (it != ctx.gpr_map.end())
140             op_delay.combine(it->second);
141       }
142 
143       bool fast_forward = (instr->opcode == aco_opcode::v_cndmask_b32 ||
144                            instr->opcode == aco_opcode::v_cndmask_b16 ||
145                            instr->opcode == aco_opcode::v_dual_cndmask_b32) &&
146                           i == 2;
147       fast_forward |= instr->isVOPD() && instr->vopd().opy == aco_opcode::v_dual_cndmask_b32 &&
148                       i + 1 == instr->operands.size();
149       if (!op_delay.lane_mask_forwarding || !fast_forward)
150          delay.combine(op_delay);
151    }
152 }
153 
154 void
update_alu(delay_ctx & ctx,bool is_valu,bool is_trans,int cycles)155 update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
156 {
157    std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
158    while (it != ctx.gpr_map.end()) {
159       alu_delay_info& entry = it->second;
160       entry.valu_instrs += is_valu ? 1 : 0;
161       entry.trans_instrs += is_trans ? 1 : 0;
162       entry.salu_cycles -= cycles;
163       entry.valu_cycles -= cycles;
164       entry.trans_cycles -= cycles;
165       it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
166    }
167 }
168 
169 void
kill_alu(alu_delay_info & delay,Instruction * instr,delay_ctx & ctx)170 kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
171 {
172    /* Consider frontend waits first. These are automatically done by the hardware,
173     * so we don't need to insert s_delay_alu.
174     * They are also lower granularity, waiting for accesses of a counter instead
175     * of only the real per register dependencies.
176     */
177    depctr_wait wait = parse_depctr_wait(instr);
178 
179    int8_t implict_cycles = 0;
180    if (!wait.va_vdst || !wait.va_sdst || !wait.va_vcc || !wait.sa_sdst || !wait.sa_exec ||
181        !wait.va_exec) {
182       std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
183       while (it != ctx.gpr_map.end()) {
184          alu_delay_info& entry = it->second;
185          bool wait_valu = !wait.va_vdst || (it->first < vcc && !wait.va_sdst) ||
186                           (it->first >= vcc && it->first <= vcc_hi && !wait.va_vcc) ||
187                           (it->first >= exec && it->first <= exec_hi && !wait.va_exec);
188          if (wait_valu) {
189             implict_cycles = MAX3(implict_cycles, entry.valu_cycles, entry.trans_cycles);
190             entry.valu_cycles = 0;
191             entry.trans_cycles = 0;
192          }
193          bool wait_salu = ((it->first <= vcc_hi || it->first == scc) && !wait.sa_sdst) ||
194                           (it->first >= exec && it->first <= exec_hi && !wait.sa_exec);
195          if (wait_salu) {
196             implict_cycles = MAX2(implict_cycles, entry.salu_cycles);
197             entry.salu_cycles = 0;
198          }
199          it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
200       }
201    }
202 
203    /* Previous alu progresses as usual while the frontend waits. */
204    if (implict_cycles != 0)
205       update_alu(ctx, false, false, implict_cycles);
206 
207    if (instr->isVALU() || instr->isSALU())
208       check_alu(ctx, delay, instr);
209 
210    if (!delay.empty()) {
211       update_alu(ctx, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
212 
213       /* remove all gprs with higher counter from map */
214       std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
215       while (it != ctx.gpr_map.end()) {
216          if (delay.valu_instrs <= it->second.valu_instrs)
217             it->second.valu_instrs = alu_delay_info::valu_nop;
218          if (delay.trans_instrs <= it->second.trans_instrs)
219             it->second.trans_instrs = alu_delay_info::trans_nop;
220          it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
221       }
222    }
223 }
224 
225 void
gen_alu(Instruction * instr,delay_ctx & ctx)226 gen_alu(Instruction* instr, delay_ctx& ctx)
227 {
228    Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
229    bool is_valu = instr->isVALU();
230    bool is_trans = instr->isTrans();
231 
232    if (is_trans || is_valu || instr->isSALU()) {
233       alu_delay_info delay;
234       delay.lane_mask_forwarding = false;
235       if (is_trans) {
236          delay.trans_instrs = 0;
237          delay.trans_cycles = cycle_info.latency;
238       } else if (is_valu) {
239          delay.valu_instrs = 0;
240          delay.valu_cycles = cycle_info.latency;
241       } else if (instr->isSALU()) {
242          delay.salu_cycles = cycle_info.latency;
243       }
244 
245       for (Definition& def : instr->definitions) {
246          if (is_valu && def.regClass() == ctx.program->lane_mask) {
247             delay.lane_mask_forwarding = instr->opcode != aco_opcode::v_readlane_b32_e64 &&
248                                          instr->opcode != aco_opcode::v_readfirstlane_b32;
249          }
250 
251          for (unsigned j = 0; j < def.size(); j++) {
252             auto it = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + j}, delay);
253             if (!it.second)
254                it.first->second.combine(delay);
255          }
256       }
257    }
258 
259    update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans,
260               cycle_info.issue_cycles);
261 }
262 
263 void
emit_delay_alu(delay_ctx & ctx,std::vector<aco_ptr<Instruction>> & instructions,alu_delay_info & delay)264 emit_delay_alu(delay_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
265                alu_delay_info& delay)
266 {
267    uint32_t imm = 0;
268    if (delay.trans_instrs != delay.trans_nop) {
269       imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
270    }
271 
272    if (delay.valu_instrs != delay.valu_nop) {
273       imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
274    }
275 
276    /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
277     * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
278     * getting this wrong isn't an issue. */
279    if (delay.salu_cycles && imm <= 0xf) {
280       unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
281       imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
282    }
283 
284    Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
285    inst->salu().imm = imm;
286    instructions.emplace_back(inst);
287    delay = alu_delay_info();
288 }
289 
290 void
handle_block(Program * program,Block & block,delay_ctx & ctx)291 handle_block(Program* program, Block& block, delay_ctx& ctx)
292 {
293    std::vector<aco_ptr<Instruction>> new_instructions;
294    alu_delay_info queued_delay;
295 
296    for (size_t i = 0; i < block.instructions.size(); i++) {
297       aco_ptr<Instruction>& instr = block.instructions[i];
298       assert(instr->opcode != aco_opcode::s_delay_alu);
299 
300       kill_alu(queued_delay, instr.get(), ctx);
301       gen_alu(instr.get(), ctx);
302 
303       if (!queued_delay.empty())
304          emit_delay_alu(ctx, new_instructions, queued_delay);
305       new_instructions.emplace_back(std::move(instr));
306    }
307 
308    block.instructions.swap(new_instructions);
309 }
310 
311 } /* end namespace */
312 
313 void
insert_delay_alu(Program * program)314 insert_delay_alu(Program* program)
315 {
316    /* per BB ctx */
317    delay_ctx ctx(program);
318 
319    for (unsigned i = 0; i < program->blocks.size(); i++) {
320       Block& current = program->blocks[i];
321 
322       if (current.instructions.empty())
323          continue;
324 
325       handle_block(program, current, ctx);
326 
327       /* Reset ctx if there is a jump, assuming ALU will be done
328        * because branch latency is pretty high.
329        */
330       if (current.linear_succs.empty() ||
331           current.instructions.back()->opcode == aco_opcode::s_branch) {
332          ctx = delay_ctx(program);
333       }
334    }
335 }
336 
337 void
combine_delay_alu(Program * program)338 combine_delay_alu(Program* program)
339 {
340    /* Combine s_delay_alu using the skip field. */
341    for (Block& block : program->blocks) {
342       int i = 0;
343       int prev_delay_alu = -1;
344       for (aco_ptr<Instruction>& instr : block.instructions) {
345          if (instr->opcode != aco_opcode::s_delay_alu) {
346             block.instructions[i++] = std::move(instr);
347             continue;
348          }
349 
350          uint16_t imm = instr->salu().imm;
351          int skip = i - prev_delay_alu - 1;
352          if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
353             if (imm >> 7 == 0)
354                prev_delay_alu = i;
355             block.instructions[i++] = std::move(instr);
356             continue;
357          }
358 
359          block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
360          prev_delay_alu = -1;
361       }
362       block.instructions.resize(i);
363    }
364 }
365 
366 } // namespace aco
367