/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include <algorithm>
#include <array>
#include <bitset>
#include <vector>

namespace aco {
namespace {

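/* Note: ACO uses a flat physical register index space; SGPRs and special
 * registers (vcc, exec, scc, ...) occupy indices 0-255 and VGPRs occupy
 * indices 256-511. The constants below describe the ranges this pass tracks.
 */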
constexpr const size_t max_reg_cnt = 512;
constexpr const size_t max_sgpr_cnt = 128;
constexpr const size_t min_vgpr = 256;
constexpr const size_t max_vgpr_cnt = 256;

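/** Position of an instruction in the program: a block index and the
 *  instruction's index within that block. A block of UINT32_MAX marks
 *  one of the sentinel values defined below.
 */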
struct Idx {
   bool operator==(const Idx& other) const { return block == other.block && instr == other.instr; }
   bool operator!=(const Idx& other) const { return !operator==(other); }

   bool found() const { return block != UINT32_MAX; }

   uint32_t block;
   uint32_t instr;
};

/** Indicates that a register was not yet written in the shader. */
Idx not_written_yet{UINT32_MAX, 0};

/** Indicates that an operand is constant or undefined, not written by any instruction. */
Idx const_or_undef{UINT32_MAX, 2};

/** Indicates that a register was overwritten by different instructions in previous blocks. */
Idx overwritten_untrackable{UINT32_MAX, 3};

/** Indicates that a register was written by subdword operations. */
Idx overwritten_subdword{UINT32_MAX, 4};

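/** Pass-wide context: for each block, instr_idx_by_regs stores which instruction
 *  (or sentinel Idx) last wrote each physical register.
 */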
struct pr_opt_ctx {
   using Idx_array = std::array<Idx, max_reg_cnt>;

   Program* program;
   Block* current_block;
   uint32_t current_instr_idx;
   std::vector<uint16_t> uses;
   std::unique_ptr<Idx_array[]> instr_idx_by_regs;

   pr_opt_ctx(Program* p)
       : program(p), current_block(nullptr), current_instr_idx(0), uses(dead_code_analysis(p)),
         instr_idx_by_regs(std::unique_ptr<Idx_array[]>{new Idx_array[p->blocks.size()]})
   {}

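   /* Merge the predecessors' register tracking info into the given block.
    * A register keeps its writer Idx only when all predecessors agree on it;
    * e.g. if one predecessor last wrote s0 at (block 1, instr 5) and another
    * at (block 2, instr 3), s0 becomes overwritten_untrackable.
    */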
   ALWAYS_INLINE void reset_block_regs(const std::vector<uint32_t>& preds,
                                       const unsigned block_index, const unsigned min_reg,
                                       const unsigned num_regs)
   {
      const unsigned num_preds = preds.size();
      const unsigned first_pred = preds[0];

      /* Copy information from the first predecessor. */
      memcpy(&instr_idx_by_regs[block_index][min_reg], &instr_idx_by_regs[first_pred][min_reg],
             num_regs * sizeof(Idx));

      /* Mark a register as untrackable when the other predecessors don't match the first. */
      const unsigned until_reg = min_reg + num_regs;
      for (unsigned i = 1; i < num_preds; ++i) {
         unsigned pred = preds[i];
         for (unsigned reg = min_reg; reg < until_reg; ++reg) {
            Idx& idx = instr_idx_by_regs[block_index][reg];
            if (idx == overwritten_untrackable)
               continue;

            if (idx != instr_idx_by_regs[pred][reg])
               idx = overwritten_untrackable;
         }
      }
   }

   void reset_block(Block* block)
   {
      current_block = block;
      current_instr_idx = 0;

      if (block->linear_preds.empty()) {
         std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(),
                   not_written_yet);
      } else if (block->kind & block_kind_loop_header) {
         /* Instructions inside the loop may overwrite registers of temporaries that are
          * not live inside the loop, but we can't detect that because we haven't processed
          * the blocks in the loop yet. As a workaround, mark all registers as untrackable.
          * TODO: Consider improving this in the future.
          */
         std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(),
                   overwritten_untrackable);
      } else {
         reset_block_regs(block->linear_preds, block->index, 0, max_sgpr_cnt);
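         /* Registers 251-253 in the flat index space are the vccz, execz and scc flags. */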
         reset_block_regs(block->linear_preds, block->index, 251, 3);

         if (!block->logical_preds.empty()) {
            /* We assume that VGPRs are only read by blocks which have a logical predecessor,
             * i.e. any block that reads any VGPR has at least 1 logical predecessor.
             */
            reset_block_regs(block->logical_preds, block->index, min_vgpr, max_vgpr_cnt);
         } else {
            /* If a block has no logical predecessors, it is not part of the
             * logical CFG and therefore it also won't have any logical successors.
             * Such a block does not write any VGPRs ever.
             */
            assert(block->logical_succs.empty());
         }
      }
   }

   Instruction* get(Idx idx) { return program->blocks[idx.block].instructions[idx.instr].get(); }
};

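/* Record this instruction as the last writer of every register covered by its definitions. */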
void
save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   for (const Definition& def : instr->definitions) {
      assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255);
      assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256);

      unsigned dw_size = DIV_ROUND_UP(def.bytes(), 4u);
      unsigned r = def.physReg().reg();
      Idx idx{ctx.current_block->index, ctx.current_instr_idx};

      if (def.regClass().is_subdword())
         idx = overwritten_subdword;

      assert((r + dw_size) <= max_reg_cnt);
      assert(def.size() == dw_size || def.regClass().is_subdword());
      std::fill(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r,
                ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size, idx);
   }
}

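/* Return the Idx of the instruction that last wrote the entire register range,
 * or overwritten_untrackable if different instructions wrote different parts of it.
 */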
Idx
last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc)
{
   /* Verify that all of the operand's registers are written by the same instruction. */
   assert(physReg.reg() < max_reg_cnt);
   Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][physReg.reg()];
   unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u);
   unsigned r = physReg.reg();
   bool all_same =
      std::all_of(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r,
                  ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size,
                  [instr_idx](Idx i) { return i == instr_idx; });

   return all_same ? instr_idx : overwritten_untrackable;
}

Idx
last_writer_idx(pr_opt_ctx& ctx, const Operand& op)
{
   if (op.isConstant() || op.isUndefined())
      return const_or_undef;

   return last_writer_idx(ctx, op.physReg(), op.regClass());
}

/**
 * Check whether a register has been overwritten since the given location.
 * This is an important part of checking whether certain optimizations are
 * valid.
 * Note that the decision is made based on registers and not on SSA IDs.
 */
bool
is_overwritten_since(pr_opt_ctx& ctx, PhysReg reg, RegClass rc, const Idx& since_idx)
{
   /* If we didn't find an instruction, assume that the register is overwritten. */
   if (!since_idx.found())
      return true;

   /* TODO: We currently can't keep track of subdword registers. */
   if (rc.is_subdword())
      return true;

   unsigned begin_reg = reg.reg();
   unsigned end_reg = begin_reg + rc.size();
   unsigned current_block_idx = ctx.current_block->index;

   for (unsigned r = begin_reg; r < end_reg; ++r) {
      Idx& i = ctx.instr_idx_by_regs[current_block_idx][r];
      if (i == overwritten_untrackable && current_block_idx > since_idx.block)
         return true;
      else if (i == overwritten_untrackable || i == not_written_yet)
         continue;
      else if (i == overwritten_subdword)
         return true;

      assert(i.found());

      if (i.block > since_idx.block || (i.block == since_idx.block && i.instr > since_idx.instr))
         return true;
   }

   return false;
}

template <typename T>
bool
is_overwritten_since(pr_opt_ctx& ctx, const T& t, const Idx& idx)
{
   return is_overwritten_since(ctx, t.physReg(), t.regClass(), idx);
}

void
try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* We are looking for the following pattern:
    *
    * vcc = ...                      ; last_vcc_wr
    * sX, scc = s_and_bXX vcc, exec  ; op0_instr
    * (...vcc and exec must not be overwritten in between...)
    * s_cbranch_XX scc               ; instr
    *
    * If possible, the above is optimized into:
    *
    * vcc = ...                      ; last_vcc_wr
    * s_cbranch_XX vcc               ; instr modified to use vcc
    */

   /* Don't try to optimize this on GFX6-7 because SMEM may corrupt the vccz bit. */
   if (ctx.program->gfx_level < GFX8)
      return;

   if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 ||
       instr->operands[0].physReg() != scc)
      return;

   Idx op0_instr_idx = last_writer_idx(ctx, instr->operands[0]);
   Idx last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);

   /* We need to make sure:
    * - the instructions that wrote the operand register and VCC are both found
    * - the operand register used by the branch and VCC were both written in the current block
    * - EXEC hasn't been overwritten since the last VCC write
    * - VCC hasn't been overwritten since the operand register was written
    *   (i.e. the last VCC writer precedes the op0 writer)
    */
   if (!op0_instr_idx.found() || !last_vcc_wr_idx.found() ||
       op0_instr_idx.block != ctx.current_block->index ||
       last_vcc_wr_idx.block != ctx.current_block->index ||
       is_overwritten_since(ctx, exec, ctx.program->lane_mask, last_vcc_wr_idx) ||
       is_overwritten_since(ctx, vcc, ctx.program->lane_mask, op0_instr_idx))
      return;

   Instruction* op0_instr = ctx.get(op0_instr_idx);
   Instruction* last_vcc_wr = ctx.get(last_vcc_wr_idx);

   if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
        op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
       op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec ||
       !last_vcc_wr->isVOPC())
      return;

   assert(last_vcc_wr->definitions[0].tempId() == op0_instr->operands[0].tempId());

   /* Reduce the uses of the SCC def */
   ctx.uses[instr->operands[0].tempId()]--;
   /* Use VCC instead of SCC in the branch */
   instr->operands[0] = op0_instr->operands[0];
}

void
try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* We are looking for the following pattern:
    *
    * s_bfe_u32 s0, s3, 0x40018  ; outputs SGPR and SCC if the SGPR != 0
    * s_cmp_eq_i32 s0, 0         ; comparison between the SGPR and 0
    * s_cbranch_scc0 BB3         ; use the result of the comparison, e.g. branch or cselect
    *
    * If possible, the above is optimized into:
    *
    * s_bfe_u32 s0, s3, 0x40018  ; original instruction
    * s_cbranch_scc1 BB3         ; modified to use SCC directly rather than the SGPR with comparison
    *
    */

   if (!instr->isSALU() && !instr->isBranch())
      return;

   if (instr->isSOPC() &&
       (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
        instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 ||
        instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) &&
       (instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) &&
       (instr->operands[0].isTemp() || instr->operands[1].isTemp())) {
      /* Make sure the constant is always in operand 1 */
      if (instr->operands[0].isConstant())
         std::swap(instr->operands[0], instr->operands[1]);

      if (ctx.uses[instr->operands[0].tempId()] > 1)
         return;

      /* Find the writer instruction of Operand 0. */
      Idx wr_idx = last_writer_idx(ctx, instr->operands[0]);
      if (!wr_idx.found())
         return;

      Instruction* wr_instr = ctx.get(wr_idx);
      if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 ||
          wr_instr->definitions[1].physReg() != scc)
         return;

      /* Look for instructions which set SCC := (D != 0) */
      switch (wr_instr->opcode) {
      case aco_opcode::s_bfe_i32:
      case aco_opcode::s_bfe_i64:
      case aco_opcode::s_bfe_u32:
      case aco_opcode::s_bfe_u64:
      case aco_opcode::s_and_b32:
      case aco_opcode::s_and_b64:
      case aco_opcode::s_andn2_b32:
      case aco_opcode::s_andn2_b64:
      case aco_opcode::s_or_b32:
      case aco_opcode::s_or_b64:
      case aco_opcode::s_orn2_b32:
      case aco_opcode::s_orn2_b64:
      case aco_opcode::s_xor_b32:
      case aco_opcode::s_xor_b64:
      case aco_opcode::s_not_b32:
      case aco_opcode::s_not_b64:
      case aco_opcode::s_nor_b32:
      case aco_opcode::s_nor_b64:
      case aco_opcode::s_xnor_b32:
      case aco_opcode::s_xnor_b64:
      case aco_opcode::s_nand_b32:
      case aco_opcode::s_nand_b64:
      case aco_opcode::s_lshl_b32:
      case aco_opcode::s_lshl_b64:
      case aco_opcode::s_lshr_b32:
      case aco_opcode::s_lshr_b64:
      case aco_opcode::s_ashr_i32:
      case aco_opcode::s_ashr_i64:
      case aco_opcode::s_abs_i32:
      case aco_opcode::s_absdiff_i32: break;
      default: return;
      }

      /* Check whether both SCC and Operand 0 are written by the same instruction. */
      Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
      if (wr_idx != sccwr_idx) {
         /* Check whether the writer's SCC def is unused and the current
          * instruction is the only user of the writer's first definition.
          */
         if (ctx.uses[wr_instr->definitions[1].tempId()] ||
             ctx.uses[wr_instr->definitions[0].tempId()] > 1)
            return;

         /* Check whether the operands of the writer are overwritten. */
         for (const Operand& op : wr_instr->operands) {
            if (!op.isConstant() && is_overwritten_since(ctx, op, wr_idx))
               return;
         }

         aco_opcode pulled_opcode = wr_instr->opcode;
         if (instr->opcode == aco_opcode::s_cmp_eq_u32 ||
             instr->opcode == aco_opcode::s_cmp_eq_i32 ||
             instr->opcode == aco_opcode::s_cmp_eq_u64) {
            /* When s_cmp_eq is used, it effectively inverts the SCC def.
             * However, we can't simply invert the opcodes here because that
             * would change the meaning of the program.
             */
            return;
         }

         Definition scc_def = instr->definitions[0];
         ctx.uses[wr_instr->definitions[0].tempId()]--;

         /* Copy the writer instruction, but use SCC from the current instr.
          * This means that the original instruction will be eliminated.
          */
         if (wr_instr->format == Format::SOP2) {
            instr.reset(create_instruction<SOP2_instruction>(pulled_opcode, Format::SOP2, 2, 2));
            instr->operands[1] = wr_instr->operands[1];
         } else if (wr_instr->format == Format::SOP1) {
            instr.reset(create_instruction<SOP1_instruction>(pulled_opcode, Format::SOP1, 1, 2));
         }
         instr->definitions[0] = wr_instr->definitions[0];
         instr->definitions[1] = scc_def;
         instr->operands[0] = wr_instr->operands[0];
         return;
      }

      /* Use the SCC def from wr_instr */
      ctx.uses[instr->operands[0].tempId()]--;
      instr->operands[0] = Operand(wr_instr->definitions[1].getTemp(), scc);
      ctx.uses[instr->operands[0].tempId()]++;

      /* Set the opcode and operand to 32-bit */
      instr->operands[1] = Operand::zero();
      instr->opcode =
         (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
          instr->opcode == aco_opcode::s_cmp_eq_u64)
            ? aco_opcode::s_cmp_eq_u32
            : aco_opcode::s_cmp_lg_u32;
   } else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 &&
               instr->operands[0].physReg() == scc) ||
              instr->opcode == aco_opcode::s_cselect_b32 ||
              instr->opcode == aco_opcode::s_cselect_b64) {

      /* For cselect, operand 2 is the SCC condition */
      unsigned scc_op_idx = 0;
      if (instr->opcode == aco_opcode::s_cselect_b32 ||
          instr->opcode == aco_opcode::s_cselect_b64) {
         scc_op_idx = 2;
      }

      Idx wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]);
      if (!wr_idx.found())
         return;

      Instruction* wr_instr = ctx.get(wr_idx);

      /* Check if we found the pattern above. */
      if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 &&
          wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
         return;
      if (wr_instr->operands[0].physReg() != scc)
         return;
      if (!wr_instr->operands[1].constantEquals(0))
         return;

      /* The optimization can be unsafe when there are other users. */
      if (ctx.uses[instr->operands[scc_op_idx].tempId()] > 1)
         return;

      if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) {
         /* Flip the meaning of the instruction to correctly use the SCC. */
         if (instr->format == Format::PSEUDO_BRANCH)
            instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
                                                                     : aco_opcode::p_cbranch_z;
         else if (instr->opcode == aco_opcode::s_cselect_b32 ||
                  instr->opcode == aco_opcode::s_cselect_b64)
            std::swap(instr->operands[0], instr->operands[1]);
         else
            unreachable(
               "scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
      }

      /* Use the SCC def from the original instruction, not the comparison */
      ctx.uses[instr->operands[scc_op_idx].tempId()]--;
      instr->operands[scc_op_idx] = wr_instr->operands[0];
   }
}

void
try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* We are looking for the following pattern:
    *
    * v_mov_dpp vA, vB, ...  ; move instruction with DPP
    * v_xxx vC, vA, ...      ; current instr that uses the result from the move
    *
    * If possible, the above is optimized into:
    *
    * v_xxx_dpp vC, vB, ...  ; current instr modified to use DPP directly
    *
    */

   if (!instr->isVALU() || instr->isDPP())
      return;

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Idx op_instr_idx = last_writer_idx(ctx, instr->operands[i]);
      if (!op_instr_idx.found())
         continue;

      /* is_overwritten_since only considers active lanes when the register could possibly
       * have been overwritten from inactive lanes. Restrict this optimization to at most
       * one block so that there is no possibility for clobbered inactive lanes.
       */
      if (ctx.current_block->index - op_instr_idx.block > 1)
         continue;

      const Instruction* mov = ctx.get(op_instr_idx);
      if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP())
         continue;

      /* If we aren't going to remove the v_mov_b32, we have to ensure that it doesn't overwrite
       * its own operand before we use it.
       */
      if (mov->definitions[0].physReg() == mov->operands[0].physReg() &&
          (!mov->definitions[0].tempId() || ctx.uses[mov->definitions[0].tempId()] > 1))
         continue;

      /* Don't propagate DPP if the source register is overwritten since the move. */
      if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx))
         continue;

      bool dpp8 = mov->isDPP8();

      /* Fetch-inactive means exec is ignored, which allows us to combine across exec changes. */
      if (!(dpp8 ? mov->dpp8().fetch_inactive : mov->dpp16().fetch_inactive) &&
          is_overwritten_since(ctx, Operand(exec, ctx.program->lane_mask), op_instr_idx))
         continue;

      /* We won't eliminate the DPP mov if the operand is used twice */
      bool op_used_twice = false;
      for (unsigned j = 0; j < instr->operands.size(); j++)
         op_used_twice |= i != j && instr->operands[i] == instr->operands[j];
      if (op_used_twice)
         continue;

      bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i) &&
                        get_operand_size(instr, i) == 32;
      bool mov_uses_mods = mov->valu().neg[0] || mov->valu().abs[0];
      if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods)
         continue;

      if (i != 0) {
         if (!can_swap_operands(instr, &instr->opcode, 0, i))
            continue;
         instr->valu().swapOperands(0, i);
      }

      if (!can_use_DPP(ctx.program->gfx_level, instr, dpp8))
         continue;

      if (!dpp8) /* anything else doesn't make sense in SSA */
         assert(mov->dpp16().row_mask == 0xf && mov->dpp16().bank_mask == 0xf);

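      /* This instruction now reads the mov's source directly. If the mov still
       * has other users, it stays alive and its source gains one more use;
       * if the mov becomes dead, its own use of the source makes up for the new one.
       */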
      if (--ctx.uses[mov->definitions[0].tempId()])
         ctx.uses[mov->operands[0].tempId()]++;

      convert_to_DPP(ctx.program->gfx_level, instr, dpp8);

      instr->operands[0] = mov->operands[0];

      if (dpp8) {
         DPP8_instruction* dpp = &instr->dpp8();
         dpp->lane_sel = mov->dpp8().lane_sel;
         dpp->fetch_inactive = mov->dpp8().fetch_inactive;
         if (mov_uses_mods)
            instr->format = asVOP3(instr->format);
      } else {
         DPP16_instruction* dpp = &instr->dpp16();
         dpp->dpp_ctrl = mov->dpp16().dpp_ctrl;
         dpp->bound_ctrl = true;
         dpp->fetch_inactive = mov->dpp16().fetch_inactive;
      }
      instr->valu().neg[0] ^= mov->valu().neg[0] && !instr->valu().abs[0];
      instr->valu().abs[0] |= mov->valu().abs[0];
      return;
   }
}

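/* Number of leading operands that are actually encoded in the instruction's
 * SALU/VALU encoding; formats this pass doesn't handle yield 0.
 */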
unsigned
num_encoded_alu_operands(const aco_ptr<Instruction>& instr)
{
   if (instr->isSALU()) {
      if (instr->isSOP2() || instr->isSOPC())
         return 2;
      else if (instr->isSOP1())
         return 1;

      return 0;
   }

   if (instr->isVALU()) {
      if (instr->isVOP1())
         return 1;
      else if (instr->isVOPC() || instr->isVOP2())
         return 2;
      else if (instr->opcode == aco_opcode::v_writelane_b32_e64 ||
               instr->opcode == aco_opcode::v_writelane_b32)
         return 2; /* potentially VOP3, but reads VDST as SRC2 */
      else if (instr->isVOP3() || instr->isVOP3P() || instr->isVINTERP_INREG())
         return instr->operands.size();
   }

   return 0;
}

void
try_reassign_split_vector(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* Any unused split_vector definition can always use the same register
    * as the operand. This avoids creating unnecessary copies.
    */
   if (instr->opcode == aco_opcode::p_split_vector) {
      Operand& op = instr->operands[0];
      if (!op.isTemp() || op.isKill())
         return;

      PhysReg reg = op.physReg();
      for (Definition& def : instr->definitions) {
         if (def.getTemp().type() == op.getTemp().type() && def.isKill())
            def.setFixed(reg);

         reg = reg.advance(def.bytes());
      }

      return;
   }

   /* We are looking for the following pattern:
    *
    * sA, sB = p_split_vector s[X:Y]
    * ... X and Y not overwritten here ...
    * use sA or sB <--- current instruction
    *
    * If possible, we propagate the registers from the p_split_vector
    * operand into the current instruction and the above is optimized into:
    *
    * use sX or sY
    *
    * Thereby, we might violate register assignment rules.
    * This optimization exists because it's too difficult to solve it
    * in RA, and should be removed once this is solved in RA.
    */

   if (!instr->isVALU() && !instr->isSALU())
      return;

   for (unsigned i = 0; i < num_encoded_alu_operands(instr); i++) {
      /* Find the instruction that writes the current operand. */
      const Operand& op = instr->operands[i];
      Idx op_instr_idx = last_writer_idx(ctx, op);
      if (!op_instr_idx.found())
         continue;

      /* Check if the operand is written by p_split_vector or p_extract_vector. */
      Instruction* split_vec = ctx.get(op_instr_idx);
      if (split_vec->opcode != aco_opcode::p_split_vector &&
          split_vec->opcode != aco_opcode::p_extract_vector)
         continue;

      Operand& split_op = split_vec->operands[0];

      /* Don't do anything if the p_split_vector operand is not a temporary
       * or is killed by the p_split_vector.
       * In this case the definitions likely already reuse the same registers as the operand.
       */
      if (!split_op.isTemp() || split_op.isKill())
         continue;

      /* Only propagate operands of the same type */
      if (split_op.getTemp().type() != op.getTemp().type())
         continue;

      /* Check if the p_split_vector operand's registers are overwritten. */
      if (is_overwritten_since(ctx, split_op, op_instr_idx))
         continue;

      PhysReg reg = split_op.physReg();
      if (split_vec->opcode == aco_opcode::p_extract_vector) {
         reg =
            reg.advance(split_vec->definitions[0].bytes() * split_vec->operands[1].constantValue());
      }
      for (Definition& def : split_vec->definitions) {
         if (def.getTemp() != op.getTemp()) {
            reg = reg.advance(def.bytes());
            continue;
         }

         /* Don't propagate misaligned SGPRs.
          * Note: No ALU instruction can take a variable larger than 64 bits.
          */
         if (op.regClass() == s2 && reg.reg() % 2 != 0)
            break;

         /* Subdword operands might need updates to SDWA/opsel,
          * but we only track full register writes at the moment.
          */
         assert(op.physReg().byte() == reg.byte());

         /* If there is only one use (left), recolor the split_vector definition */
         if (ctx.uses[op.tempId()] == 1)
            def.setFixed(reg);
         else
            ctx.uses[op.tempId()]--;

         /* Use the p_split_vector operand register directly.
          *
          * Note: this might violate register assignment rules to some extent
          * in case the definition does not get recolored, eventually.
          */
         instr->operands[i].setFixed(reg);
         break;
      }
   }
}

void
try_convert_fma_to_vop2(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* We convert v_fma_f32 with an inline constant into fmamk/fmaak.
    * This is only beneficial if it allows more VOPD.
    */
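   /* The two VOP2 forms take a 32-bit literal K:
    *   v_fmaak_f32: dst = src0 * src1 + K
    *   v_fmamk_f32: dst = src0 * K + src1
    */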
   if (ctx.program->gfx_level < GFX11 || ctx.program->wave_size != 32 ||
       instr->opcode != aco_opcode::v_fma_f32 || instr->usesModifiers())
      return;

   int constant_idx = -1;
   int vgpr_idx = -1;
   for (int i = 0; i < 3; i++) {
      const Operand& op = instr->operands[i];
      if (op.isConstant() && !op.isLiteral())
         constant_idx = i;
      else if (op.isOfType(RegType::vgpr))
         vgpr_idx = i;
      else
         return;
   }

   if (constant_idx < 0 || vgpr_idx < 0)
      return;

   std::swap(instr->operands[constant_idx], instr->operands[2]);
   if (constant_idx == 0 || vgpr_idx == 0)
      std::swap(instr->operands[0], instr->operands[1]);
   instr->operands[2] = Operand::literal32(instr->operands[2].constantValue());
   instr->opcode = constant_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
   instr->format = Format::VOP2;
}

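/* Apply all optimizations to one instruction in-place, then record which registers it writes. */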
void
process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* Don't try to optimize instructions which are already dead. */
   if (!instr || is_dead(ctx.uses, instr.get())) {
      instr.reset();
      ctx.current_instr_idx++;
      return;
   }

   try_apply_branch_vcc(ctx, instr);

   try_optimize_scc_nocompare(ctx, instr);

   try_combine_dpp(ctx, instr);

   try_reassign_split_vector(ctx, instr);

   try_convert_fma_to_vop2(ctx, instr);

   if (instr)
      save_reg_writes(ctx, instr);

   ctx.current_instr_idx++;
}

} // namespace

void
optimize_postRA(Program* program)
{
   pr_opt_ctx ctx(program);

   /* Forward pass
    * Goes through each instruction exactly once, and can transform
    * instructions or adjust the use counts of temps.
    */
   for (auto& block : program->blocks) {
      ctx.reset_block(&block);

      for (aco_ptr<Instruction>& instr : block.instructions)
         process_instruction(ctx, instr);
   }

   /* Cleanup pass
    * Gets rid of instructions which are manually deleted or
    * no longer have any uses.
    */
   for (auto& block : program->blocks) {
      std::vector<aco_ptr<Instruction>> instructions;
      instructions.reserve(block.instructions.size());

      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!instr || is_dead(ctx.uses, instr.get()))
            continue;

         instructions.emplace_back(std::move(instr));
      }

      block.instructions = std::move(instructions);
   }
}

} // namespace aco