• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2021 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_builder.h"
26 #include "aco_ir.h"
27 
28 #include <algorithm>
29 #include <array>
30 #include <bitset>
31 #include <vector>
32 
33 namespace aco {
34 namespace {
35 
/* Number of physical registers tracked per block. The tracking array covers
 * SGPRs [0, max_sgpr_cnt), a few special registers, and VGPRs
 * [min_vgpr, min_vgpr + max_vgpr_cnt).
 * Note: `constexpr` already implies `const`, so the redundant `const` is dropped.
 */
constexpr size_t max_reg_cnt = 512;
constexpr size_t max_sgpr_cnt = 128;
constexpr size_t min_vgpr = 256;
constexpr size_t max_vgpr_cnt = 256;
40 
/* Identifies an instruction by its position: block index and the index of the
 * instruction within that block. Also used with sentinel values (block == UINT32_MAX)
 * to encode special register states.
 */
struct Idx {
   bool operator==(const Idx& other) const
   {
      return block == other.block && instr == other.instr;
   }

   bool operator!=(const Idx& other) const { return !(*this == other); }

   /* Whether this refers to a real instruction rather than a sentinel value. */
   bool found() const { return block != UINT32_MAX; }

   uint32_t block;
   uint32_t instr;
};
50 
51 /** Indicates that a register was not yet written in the shader. */
52 Idx not_written_yet{UINT32_MAX, 0};
53 
54 /** Indicates that an operand is constant or undefined, not written by any instruction. */
55 Idx const_or_undef{UINT32_MAX, 2};
56 
57 /** Indicates that a register was overwritten by different instructions in previous blocks. */
58 Idx overwritten_untrackable{UINT32_MAX, 3};
59 
60 /** Indicates that a register was written by subdword operations. */
61 Idx overwritten_subdword{UINT32_MAX, 4};
62 
/* Context of the post-RA optimizer: tracks, per block and per physical register,
 * which instruction last wrote that register.
 */
struct pr_opt_ctx {
   using Idx_array = std::array<Idx, max_reg_cnt>;

   Program* program;
   Block* current_block;
   uint32_t current_instr_idx;
   /* Remaining use count per SSA id; decremented as optimizations remove uses. */
   std::vector<uint16_t> uses;
   /* One Idx_array per block: maps each physical register to the Idx of its last
    * writer, or to one of the sentinel values defined above.
    */
   std::unique_ptr<Idx_array[]> instr_idx_by_regs;

   pr_opt_ctx(Program* p)
       : program(p), current_block(nullptr), current_instr_idx(0), uses(dead_code_analysis(p)),
         instr_idx_by_regs(std::unique_ptr<Idx_array[]>{new Idx_array[p->blocks.size()]})
   {}

   /* Initialize tracking info for registers [min_reg, min_reg + num_regs) of block
    * \p block_index by merging the info of its predecessors \p preds: a register
    * whose last writer differs between predecessors becomes untrackable.
    */
   ALWAYS_INLINE void reset_block_regs(const std::vector<uint32_t>& preds,
                                       const unsigned block_index, const unsigned min_reg,
                                       const unsigned num_regs)
   {
      const unsigned num_preds = preds.size();
      const unsigned first_pred = preds[0];

      /* Copy information from the first predecessor. (Idx is trivially copyable.) */
      memcpy(&instr_idx_by_regs[block_index][min_reg], &instr_idx_by_regs[first_pred][min_reg],
             num_regs * sizeof(Idx));

      /* Mark overwritten if it doesn't match with other predecessors. */
      const unsigned until_reg = min_reg + num_regs;
      for (unsigned i = 1; i < num_preds; ++i) {
         unsigned pred = preds[i];
         for (unsigned reg = min_reg; reg < until_reg; ++reg) {
            Idx& idx = instr_idx_by_regs[block_index][reg];
            if (idx == overwritten_untrackable)
               continue;

            if (idx != instr_idx_by_regs[pred][reg])
               idx = overwritten_untrackable;
         }
      }
   }

   /* Prepare the tracking state for processing a new block. */
   void reset_block(Block* block)
   {
      current_block = block;
      current_instr_idx = 0;

      if (block->linear_preds.empty()) {
         /* Entry block (no predecessors): nothing has been written yet. */
         std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(),
                   not_written_yet);
      } else if (block->kind & block_kind_loop_header) {
         /* Instructions inside the loop may overwrite registers of temporaries that are
          * not live inside the loop, but we can't detect that because we haven't processed
          * the blocks in the loop yet. As a workaround, mark all registers as untrackable.
          * TODO: Consider improving this in the future.
          */
         std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(),
                   overwritten_untrackable);
      } else {
         /* Merge SGPRs and registers 251..253 (presumably vccz/execz/scc —
          * confirm against the PhysReg definitions) from the linear predecessors.
          */
         reset_block_regs(block->linear_preds, block->index, 0, max_sgpr_cnt);
         reset_block_regs(block->linear_preds, block->index, 251, 3);

         if (!block->logical_preds.empty()) {
            /* We assume that VGPRs are only read by blocks which have a logical predecessor,
             * ie. any block that reads any VGPR has at least 1 logical predecessor.
             */
            reset_block_regs(block->logical_preds, block->index, min_vgpr, max_vgpr_cnt);
         } else {
            /* If a block has no logical predecessors, it is not part of the
             * logical CFG and therefore it also won't have any logical successors.
             * Such a block does not write any VGPRs ever.
             */
            assert(block->logical_succs.empty());
         }
      }
   }

   /* Fetch the instruction at \p idx. Only valid for idx.found() indices. */
   Instruction* get(Idx idx) { return program->blocks[idx.block].instructions[idx.instr].get(); }
};
140 
141 void
save_reg_writes(pr_opt_ctx & ctx,aco_ptr<Instruction> & instr)142 save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
143 {
144    for (const Definition& def : instr->definitions) {
145       assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255);
146       assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256);
147 
148       unsigned dw_size = DIV_ROUND_UP(def.bytes(), 4u);
149       unsigned r = def.physReg().reg();
150       Idx idx{ctx.current_block->index, ctx.current_instr_idx};
151 
152       if (def.regClass().is_subdword())
153          idx = overwritten_subdword;
154 
155       assert((r + dw_size) <= max_reg_cnt);
156       assert(def.size() == dw_size || def.regClass().is_subdword());
157       std::fill(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r,
158                 ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size, idx);
159    }
160 }
161 
162 Idx
last_writer_idx(pr_opt_ctx & ctx,PhysReg physReg,RegClass rc)163 last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc)
164 {
165    /* Verify that all of the operand's registers are written by the same instruction. */
166    assert(physReg.reg() < max_reg_cnt);
167    Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][physReg.reg()];
168    unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u);
169    unsigned r = physReg.reg();
170    bool all_same =
171       std::all_of(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r,
172                   ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size,
173                   [instr_idx](Idx i) { return i == instr_idx; });
174 
175    return all_same ? instr_idx : overwritten_untrackable;
176 }
177 
178 Idx
last_writer_idx(pr_opt_ctx & ctx,const Operand & op)179 last_writer_idx(pr_opt_ctx& ctx, const Operand& op)
180 {
181    if (op.isConstant() || op.isUndefined())
182       return const_or_undef;
183 
184    return last_writer_idx(ctx, op.physReg(), op.regClass());
185 }
186 
/**
 * Check whether a register has been overwritten since the given location.
 * This is an important part of checking whether certain optimizations are
 * valid.
 * Note that the decision is made based on registers and not on SSA IDs.
 */
bool
is_overwritten_since(pr_opt_ctx& ctx, PhysReg reg, RegClass rc, const Idx& since_idx)
{
   /* If we didn't find an instruction, assume that the register is overwritten. */
   if (!since_idx.found())
      return true;

   /* TODO: We currently can't keep track of subdword registers. */
   if (rc.is_subdword())
      return true;

   unsigned begin_reg = reg.reg();
   unsigned end_reg = begin_reg + rc.size();
   unsigned current_block_idx = ctx.current_block->index;

   for (unsigned r = begin_reg; r < end_reg; ++r) {
      Idx& i = ctx.instr_idx_by_regs[current_block_idx][r];
      /* A register untrackable across block boundaries may have been written
       * after since_idx, so be conservative when since_idx is in an older block.
       */
      if (i == overwritten_untrackable && current_block_idx > since_idx.block)
         return true;
      else if (i == overwritten_untrackable || i == not_written_yet)
         continue;
      else if (i == overwritten_subdword)
         return true;

      assert(i.found());

      /* Overwritten iff the last writer comes after since_idx in program order. */
      if (i.block > since_idx.block || (i.block == since_idx.block && i.instr > since_idx.instr))
         return true;
   }

   return false;
}
225 
/* Convenience overload for anything with physReg()/regClass() (Operand, Definition). */
template <typename T>
bool
is_overwritten_since(pr_opt_ctx& ctx, const T& t, const Idx& idx)
{
   return is_overwritten_since(ctx, t.physReg(), t.regClass(), idx);
}
232 
void
try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* We are looking for the following pattern:
    *
    * vcc = ...                      ; last_vcc_wr
    * sX, scc = s_and_bXX vcc, exec  ; op0_instr
    * (...vcc and exec must not be overwritten inbetween...)
    * s_cbranch_XX scc               ; instr
    *
    * If possible, the above is optimized into:
    *
    * vcc = ...                      ; last_vcc_wr
    * s_cbranch_XX vcc               ; instr modified to use vcc
    */

   /* Don't try to optimize this on GFX6-7 because SMEM may corrupt the vccz bit. */
   if (ctx.program->gfx_level < GFX8)
      return;

   /* Only branches conditional on SCC are candidates. */
   if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 ||
       instr->operands[0].physReg() != scc)
      return;

   Idx op0_instr_idx = last_writer_idx(ctx, instr->operands[0]);
   Idx last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);

   /* We need to make sure:
    * - the instructions that wrote the operand register and VCC are both found
    * - the operand register used by the branch, and VCC were both written in the current block
    * - EXEC hasn't been overwritten since the last VCC write
    * - VCC hasn't been overwritten since the operand register was written
    *   (ie. the last VCC writer precedes the op0 writer)
    */
   if (!op0_instr_idx.found() || !last_vcc_wr_idx.found() ||
       op0_instr_idx.block != ctx.current_block->index ||
       last_vcc_wr_idx.block != ctx.current_block->index ||
       is_overwritten_since(ctx, exec, ctx.program->lane_mask, last_vcc_wr_idx) ||
       is_overwritten_since(ctx, vcc, ctx.program->lane_mask, op0_instr_idx))
      return;

   Instruction* op0_instr = ctx.get(op0_instr_idx);
   Instruction* last_vcc_wr = ctx.get(last_vcc_wr_idx);

   /* The SCC writer must be `s_and vcc, exec` and VCC must come from a VOPC compare. */
   if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
        op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
       op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec ||
       !last_vcc_wr->isVOPC())
      return;

   assert(last_vcc_wr->definitions[0].tempId() == op0_instr->operands[0].tempId());

   /* Reduce the uses of the SCC def */
   ctx.uses[instr->operands[0].tempId()]--;
   /* Use VCC instead of SCC in the branch */
   instr->operands[0] = op0_instr->operands[0];
}
290 
void
try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* We are looking for the following pattern:
    *
    * s_bfe_u32 s0, s3, 0x40018  ; outputs SGPR and SCC if the SGPR != 0
    * s_cmp_eq_i32 s0, 0         ; comparison between the SGPR and 0
    * s_cbranch_scc0 BB3         ; use the result of the comparison, eg. branch or cselect
    *
    * If possible, the above is optimized into:
    *
    * s_bfe_u32 s0, s3, 0x40018  ; original instruction
    * s_cbranch_scc1 BB3         ; modified to use SCC directly rather than the SGPR with comparison
    *
    */

   if (!instr->isSALU() && !instr->isBranch())
      return;

   /* Case 1: the current instruction is a comparison of an SGPR against zero. */
   if (instr->isSOPC() &&
       (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
        instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 ||
        instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) &&
       (instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) &&
       (instr->operands[0].isTemp() || instr->operands[1].isTemp())) {
      /* Make sure the constant is always in operand 1 */
      if (instr->operands[0].isConstant())
         std::swap(instr->operands[0], instr->operands[1]);

      /* Skip if the compared temporary has other users as well. */
      if (ctx.uses[instr->operands[0].tempId()] > 1)
         return;

      /* Find the writer instruction of Operand 0. */
      Idx wr_idx = last_writer_idx(ctx, instr->operands[0]);
      if (!wr_idx.found())
         return;

      /* The writer must be a SALU instruction that also defines SCC. */
      Instruction* wr_instr = ctx.get(wr_idx);
      if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 ||
          wr_instr->definitions[1].physReg() != scc)
         return;

      /* Look for instructions which set SCC := (D != 0) */
      switch (wr_instr->opcode) {
      case aco_opcode::s_bfe_i32:
      case aco_opcode::s_bfe_i64:
      case aco_opcode::s_bfe_u32:
      case aco_opcode::s_bfe_u64:
      case aco_opcode::s_and_b32:
      case aco_opcode::s_and_b64:
      case aco_opcode::s_andn2_b32:
      case aco_opcode::s_andn2_b64:
      case aco_opcode::s_or_b32:
      case aco_opcode::s_or_b64:
      case aco_opcode::s_orn2_b32:
      case aco_opcode::s_orn2_b64:
      case aco_opcode::s_xor_b32:
      case aco_opcode::s_xor_b64:
      case aco_opcode::s_not_b32:
      case aco_opcode::s_not_b64:
      case aco_opcode::s_nor_b32:
      case aco_opcode::s_nor_b64:
      case aco_opcode::s_xnor_b32:
      case aco_opcode::s_xnor_b64:
      case aco_opcode::s_nand_b32:
      case aco_opcode::s_nand_b64:
      case aco_opcode::s_lshl_b32:
      case aco_opcode::s_lshl_b64:
      case aco_opcode::s_lshr_b32:
      case aco_opcode::s_lshr_b64:
      case aco_opcode::s_ashr_i32:
      case aco_opcode::s_ashr_i64:
      case aco_opcode::s_abs_i32:
      case aco_opcode::s_absdiff_i32: break;
      default: return;
      }

      /* Check whether both SCC and Operand 0 are written by the same instruction. */
      Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
      if (wr_idx != sccwr_idx) {
         /* SCC was overwritten since wr_instr; try to re-emit (pull down) the
          * writer here instead of using its stale SCC def.
          */
         /* Check whether the current instruction is the only user of its first operand. */
         if (ctx.uses[wr_instr->definitions[1].tempId()] ||
             ctx.uses[wr_instr->definitions[0].tempId()] > 1)
            return;

         /* Check whether the operands of the writer are overwritten. */
         for (const Operand& op : wr_instr->operands) {
            if (!op.isConstant() && is_overwritten_since(ctx, op, wr_idx))
               return;
         }

         aco_opcode pulled_opcode = wr_instr->opcode;
         if (instr->opcode == aco_opcode::s_cmp_eq_u32 ||
             instr->opcode == aco_opcode::s_cmp_eq_i32 ||
             instr->opcode == aco_opcode::s_cmp_eq_u64) {
            /* When s_cmp_eq is used, it effectively inverts the SCC def.
             * However, we can't simply invert the opcodes here because that
             * would change the meaning of the program.
             */
            return;
         }

         Definition scc_def = instr->definitions[0];
         ctx.uses[wr_instr->definitions[0].tempId()]--;

         /* Copy the writer instruction, but use SCC from the current instr.
          * This means that the original instruction will be eliminated.
          */
         if (wr_instr->format == Format::SOP2) {
            instr.reset(create_instruction<SOP2_instruction>(pulled_opcode, Format::SOP2, 2, 2));
            instr->operands[1] = wr_instr->operands[1];
         } else if (wr_instr->format == Format::SOP1) {
            instr.reset(create_instruction<SOP1_instruction>(pulled_opcode, Format::SOP1, 1, 2));
         }
         instr->definitions[0] = wr_instr->definitions[0];
         instr->definitions[1] = scc_def;
         instr->operands[0] = wr_instr->operands[0];
         return;
      }

      /* Use the SCC def from wr_instr */
      ctx.uses[instr->operands[0].tempId()]--;
      instr->operands[0] = Operand(wr_instr->definitions[1].getTemp(), scc);
      ctx.uses[instr->operands[0].tempId()]++;

      /* Set the opcode and operand to 32-bit */
      instr->operands[1] = Operand::zero();
      instr->opcode =
         (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
          instr->opcode == aco_opcode::s_cmp_eq_u64)
            ? aco_opcode::s_cmp_eq_u32
            : aco_opcode::s_cmp_lg_u32;
   } else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 &&
               instr->operands[0].physReg() == scc) ||
              instr->opcode == aco_opcode::s_cselect_b32 ||
              instr->opcode == aco_opcode::s_cselect_b64) {
      /* Case 2: the current instruction consumes SCC (branch or cselect). */

      /* For cselect, operand 2 is the SCC condition */
      unsigned scc_op_idx = 0;
      if (instr->opcode == aco_opcode::s_cselect_b32 ||
          instr->opcode == aco_opcode::s_cselect_b64) {
         scc_op_idx = 2;
      }

      Idx wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]);
      if (!wr_idx.found())
         return;

      Instruction* wr_instr = ctx.get(wr_idx);

      /* Check if we found the pattern above. */
      if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 &&
          wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
         return;
      if (wr_instr->operands[0].physReg() != scc)
         return;
      if (!wr_instr->operands[1].constantEquals(0))
         return;

      /* The optimization can be unsafe when there are other users. */
      if (ctx.uses[instr->operands[scc_op_idx].tempId()] > 1)
         return;

      if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) {
         /* Flip the meaning of the instruction to correctly use the SCC. */
         if (instr->format == Format::PSEUDO_BRANCH)
            instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
                                                                     : aco_opcode::p_cbranch_z;
         else if (instr->opcode == aco_opcode::s_cselect_b32 ||
                  instr->opcode == aco_opcode::s_cselect_b64)
            std::swap(instr->operands[0], instr->operands[1]);
         else
            unreachable(
               "scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
      }

      /* Use the SCC def from the original instruction, not the comparison */
      ctx.uses[instr->operands[scc_op_idx].tempId()]--;
      instr->operands[scc_op_idx] = wr_instr->operands[0];
   }
}
472 
void
try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* We are looking for the following pattern:
    *
    * v_mov_dpp vA, vB, ...      ; move instruction with DPP
    * v_xxx vC, vA, ...          ; current instr that uses the result from the move
    *
    * If possible, the above is optimized into:
    *
    * v_xxx_dpp vC, vB, ...      ; current instr modified to use DPP directly
    *
    */

   if (!instr->isVALU() || instr->isDPP())
      return;

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Idx op_instr_idx = last_writer_idx(ctx, instr->operands[i]);
      if (!op_instr_idx.found())
         continue;

      /* is_overwritten_since only considers active lanes when the register could possibly
       * have been overwritten from inactive lanes. Restrict this optimization to at most
       * one block so that there is no possibility for clobbered inactive lanes.
       */
      if (ctx.current_block->index - op_instr_idx.block > 1)
         continue;

      const Instruction* mov = ctx.get(op_instr_idx);
      if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP())
         continue;

      /* If we aren't going to remove the v_mov_b32, we have to ensure that it doesn't overwrite
       * its own operand before we use it.
       */
      if (mov->definitions[0].physReg() == mov->operands[0].physReg() &&
          (!mov->definitions[0].tempId() || ctx.uses[mov->definitions[0].tempId()] > 1))
         continue;

      /* Don't propagate DPP if the source register is overwritten since the move. */
      if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx))
         continue;

      bool dpp8 = mov->isDPP8();

      /* Fetch-inactive means exec is ignored, which allows us to combine across exec changes. */
      if (!(dpp8 ? mov->dpp8().fetch_inactive : mov->dpp16().fetch_inactive) &&
          is_overwritten_since(ctx, Operand(exec, ctx.program->lane_mask), op_instr_idx))
         continue;

      /* We won't eliminate the DPP mov if the operand is used twice */
      bool op_used_twice = false;
      for (unsigned j = 0; j < instr->operands.size(); j++)
         op_used_twice |= i != j && instr->operands[i] == instr->operands[j];
      if (op_used_twice)
         continue;

      /* The mov's neg/abs modifiers can only be kept if the consumer supports
       * input modifiers on this (32-bit) operand.
       */
      bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i) &&
                        get_operand_size(instr, i) == 32;
      bool mov_uses_mods = mov->valu().neg[0] || mov->valu().abs[0];
      if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods)
         continue;

      /* The DPP source must become operand 0 of the consumer. */
      if (i != 0) {
         if (!can_swap_operands(instr, &instr->opcode, 0, i))
            continue;
         instr->valu().swapOperands(0, i);
      }

      if (!can_use_DPP(ctx.program->gfx_level, instr, dpp8))
         continue;

      if (!dpp8) /* anything else doesn't make sense in SSA */
         assert(mov->dpp16().row_mask == 0xf && mov->dpp16().bank_mask == 0xf);

      /* Transfer one use from the mov's definition to the mov's source. */
      if (--ctx.uses[mov->definitions[0].tempId()])
         ctx.uses[mov->operands[0].tempId()]++;

      convert_to_DPP(ctx.program->gfx_level, instr, dpp8);

      instr->operands[0] = mov->operands[0];

      if (dpp8) {
         DPP8_instruction* dpp = &instr->dpp8();
         dpp->lane_sel = mov->dpp8().lane_sel;
         dpp->fetch_inactive = mov->dpp8().fetch_inactive;
         if (mov_uses_mods)
            instr->format = asVOP3(instr->format);
      } else {
         DPP16_instruction* dpp = &instr->dpp16();
         dpp->dpp_ctrl = mov->dpp16().dpp_ctrl;
         dpp->bound_ctrl = true;
         dpp->fetch_inactive = mov->dpp16().fetch_inactive;
      }
      /* Fold the mov's neg/abs modifiers into the consumer's first operand. */
      instr->valu().neg[0] ^= mov->valu().neg[0] && !instr->valu().abs[0];
      instr->valu().abs[0] |= mov->valu().abs[0];
      return;
   }
}
573 
574 unsigned
num_encoded_alu_operands(const aco_ptr<Instruction> & instr)575 num_encoded_alu_operands(const aco_ptr<Instruction>& instr)
576 {
577    if (instr->isSALU()) {
578       if (instr->isSOP2() || instr->isSOPC())
579          return 2;
580       else if (instr->isSOP1())
581          return 1;
582 
583       return 0;
584    }
585 
586    if (instr->isVALU()) {
587       if (instr->isVOP1())
588          return 1;
589       else if (instr->isVOPC() || instr->isVOP2())
590          return 2;
591       else if (instr->opcode == aco_opcode::v_writelane_b32_e64 ||
592                instr->opcode == aco_opcode::v_writelane_b32)
593          return 2; /* potentially VOP3, but reads VDST as SRC2 */
594       else if (instr->isVOP3() || instr->isVOP3P() || instr->isVINTERP_INREG())
595          return instr->operands.size();
596    }
597 
598    return 0;
599 }
600 
void
try_reassign_split_vector(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* Any unused split_vector definition can always use the same register
    * as the operand. This avoids creating unnecessary copies.
    */
   if (instr->opcode == aco_opcode::p_split_vector) {
      Operand& op = instr->operands[0];
      if (!op.isTemp() || op.isKill())
         return;

      /* Walk the definitions and assign each killed def the matching slice
       * of the operand's register range.
       */
      PhysReg reg = op.physReg();
      for (Definition& def : instr->definitions) {
         if (def.getTemp().type() == op.getTemp().type() && def.isKill())
            def.setFixed(reg);

         reg = reg.advance(def.bytes());
      }

      return;
   }

   /* We are looking for the following pattern:
    *
    * sA, sB = p_split_vector s[X:Y]
    * ... X and Y not overwritten here ...
    * use sA or sB <--- current instruction
    *
    * If possible, we propagate the registers from the p_split_vector
    * operand into the current instruction and the above is optimized into:
    *
    * use sX or sY
    *
    * Thereby, we might violate register assignment rules.
    * This optimization exists because it's too difficult to solve it
    * in RA, and should be removed after we solved this in RA.
    */

   if (!instr->isVALU() && !instr->isSALU())
      return;

   for (unsigned i = 0; i < num_encoded_alu_operands(instr); i++) {
      /* Find the instruction that writes the current operand. */
      const Operand& op = instr->operands[i];
      Idx op_instr_idx = last_writer_idx(ctx, op);
      if (!op_instr_idx.found())
         continue;

      /* Check if the operand is written by p_split_vector. */
      Instruction* split_vec = ctx.get(op_instr_idx);
      if (split_vec->opcode != aco_opcode::p_split_vector &&
          split_vec->opcode != aco_opcode::p_extract_vector)
         continue;

      Operand& split_op = split_vec->operands[0];

      /* Don't do anything if the p_split_vector operand is not a temporary
       * or is killed by the p_split_vector.
       * In this case the definitions likely already reuse the same registers as the operand.
       */
      if (!split_op.isTemp() || split_op.isKill())
         continue;

      /* Only propagate operands of the same type */
      if (split_op.getTemp().type() != op.getTemp().type())
         continue;

      /* Check if the p_split_vector operand's registers are overwritten. */
      if (is_overwritten_since(ctx, split_op, op_instr_idx))
         continue;

      PhysReg reg = split_op.physReg();
      /* For p_extract_vector, skip to the extracted component's offset. */
      if (split_vec->opcode == aco_opcode::p_extract_vector) {
         reg =
            reg.advance(split_vec->definitions[0].bytes() * split_vec->operands[1].constantValue());
      }
      for (Definition& def : split_vec->definitions) {
         if (def.getTemp() != op.getTemp()) {
            reg = reg.advance(def.bytes());
            continue;
         }

         /* Don't propagate misaligned SGPRs.
          * Note: No ALU instruction can take a variable larger than 64bit.
          */
         if (op.regClass() == s2 && reg.reg() % 2 != 0)
            break;

         /* Sub dword operands might need updates to SDWA/opsel,
          * but we only track full register writes at the moment.
          */
         assert(op.physReg().byte() == reg.byte());

         /* If there is only one use (left), recolor the split_vector definition */
         if (ctx.uses[op.tempId()] == 1)
            def.setFixed(reg);
         else
            ctx.uses[op.tempId()]--;

         /* Use the p_split_vector operand register directly.
          *
          * Note: this might violate register assignment rules to some extent
          *       in case the definition does not get recolored, eventually.
          */
         instr->operands[i].setFixed(reg);
         break;
      }
   }
}
710 
void
try_convert_fma_to_vop2(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* We convert v_fma_f32 with inline constant to fmamk/fmaak.
    * This is only beneficial if it allows more VOPD.
    */
   if (ctx.program->gfx_level < GFX11 || ctx.program->wave_size != 32 ||
       instr->opcode != aco_opcode::v_fma_f32 || instr->usesModifiers())
      return;

   /* Every operand must be an inline constant or a VGPR; remember where the
    * last constant and last VGPR were seen.
    */
   int constant_idx = -1;
   int vgpr_idx = -1;
   for (int i = 0; i < 3; i++) {
      const Operand& op = instr->operands[i];
      if (op.isConstant() && !op.isLiteral())
         constant_idx = i;
      else if (op.isOfType(RegType::vgpr))
         vgpr_idx = i;
      else
         return;
   }

   if (constant_idx < 0 || vgpr_idx < 0)
      return;

   /* Move the constant into operand 2, where fmaak/fmamk take their literal. */
   std::swap(instr->operands[constant_idx], instr->operands[2]);
   /* NOTE(review): presumably this keeps a VGPR in src1 as required by the
    * VOP2 encoding — confirm against the v_fmamk/v_fmaak operand rules.
    */
   if (constant_idx == 0 || vgpr_idx == 0)
      std::swap(instr->operands[0], instr->operands[1]);
   instr->operands[2] = Operand::literal32(instr->operands[2].constantValue());
   instr->opcode = constant_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
   instr->format = Format::VOP2;
}
743 
744 void
process_instruction(pr_opt_ctx & ctx,aco_ptr<Instruction> & instr)745 process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
746 {
747    /* Don't try to optimize instructions which are already dead. */
748    if (!instr || is_dead(ctx.uses, instr.get())) {
749       instr.reset();
750       ctx.current_instr_idx++;
751       return;
752    }
753 
754    try_apply_branch_vcc(ctx, instr);
755 
756    try_optimize_scc_nocompare(ctx, instr);
757 
758    try_combine_dpp(ctx, instr);
759 
760    try_reassign_split_vector(ctx, instr);
761 
762    try_convert_fma_to_vop2(ctx, instr);
763 
764    if (instr)
765       save_reg_writes(ctx, instr);
766 
767    ctx.current_instr_idx++;
768 }
769 
770 } // namespace
771 
772 void
optimize_postRA(Program * program)773 optimize_postRA(Program* program)
774 {
775    pr_opt_ctx ctx(program);
776 
777    /* Forward pass
778     * Goes through each instruction exactly once, and can transform
779     * instructions or adjust the use counts of temps.
780     */
781    for (auto& block : program->blocks) {
782       ctx.reset_block(&block);
783 
784       for (aco_ptr<Instruction>& instr : block.instructions)
785          process_instruction(ctx, instr);
786    }
787 
788    /* Cleanup pass
789     * Gets rid of instructions which are manually deleted or
790     * no longer have any uses.
791     */
792    for (auto& block : program->blocks) {
793       std::vector<aco_ptr<Instruction>> instructions;
794       instructions.reserve(block.instructions.size());
795 
796       for (aco_ptr<Instruction>& instr : block.instructions) {
797          if (!instr || is_dead(ctx.uses, instr.get()))
798             continue;
799 
800          instructions.emplace_back(std::move(instr));
801       }
802 
803       block.instructions = std::move(instructions);
804    }
805 }
806 
807 } // namespace aco
808