/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "util/bitset.h"
#include "util/enum_operators.h"

#include <algorithm>
#include <array>
#include <bitset>
#include <map>
#include <optional>
#include <vector>

namespace aco {
namespace {

struct ra_ctx;
struct DefInfo;

unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                                     unsigned idx, RegClass rc);
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                          RegClass rc);
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
                             bool allow_16bit_write);

struct assignment {
   PhysReg reg;
   RegClass rc;
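   /* The flag bits below alias the '_' byte, so its "= 0" initializer
    * zero-initializes all of the flags at once by default. */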
   union {
      struct {
         bool assigned : 1;
         bool vcc : 1;
         bool m0 : 1;
         bool renamed : 1;
      };
      uint8_t _ = 0;
   };
   uint32_t affinity = 0;
   assignment() = default;
   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }
   void set(const Definition& def)
   {
      assigned = true;
      reg = def.physReg();
      rc = def.regClass();
   }
};

/* Iterator type for making PhysRegInterval compatible with range-based for */
struct PhysRegIterator {
   using difference_type = int;
   using value_type = unsigned;
   using reference = const unsigned&;
   using pointer = const unsigned*;
   using iterator_category = std::bidirectional_iterator_tag;

   PhysReg reg;

   PhysReg operator*() const { return reg; }

   PhysRegIterator& operator++()
   {
      reg.reg_b += 4;
      return *this;
   }

   PhysRegIterator& operator--()
   {
      reg.reg_b -= 4;
      return *this;
   }

   bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }

   bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }

   bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
};

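/* Describes a run of instruction operands that should end up in consecutive
 * registers so that they can form a vector. */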
struct vector_info {
   vector_info() : is_weak(false), num_parts(0), parts(NULL) {}
   vector_info(Instruction* instr, unsigned start = 0, bool weak = false)
       : is_weak(weak), num_parts(instr->operands.size() - start),
         parts(instr->operands.begin() + start)
   {}

   /* If true, then we should stop trying to form a vector if anything goes wrong.
    * Useful when failing to form the vector does not introduce extra copies. */
   bool is_weak;
   uint32_t num_parts;
   Operand* parts;
};

struct ra_ctx {

   Program* program;
   Block* block = NULL;
   aco::monotonic_buffer_resource memory;
   std::vector<assignment> assignments;
   std::vector<aco::unordered_map<uint32_t, Temp>> renames;
   std::vector<std::pair<uint32_t, PhysReg>> loop_header;
   aco::unordered_map<uint32_t, Temp> orig_names;
   aco::unordered_map<uint32_t, vector_info> vectors;
   aco::unordered_map<uint32_t, Instruction*> split_vectors;
   aco_ptr<Instruction> pseudo_dummy;
   aco_ptr<Instruction> phi_dummy;
   uint16_t max_used_sgpr = 0;
   uint16_t max_used_vgpr = 0;
   uint16_t sgpr_limit;
   uint16_t vgpr_limit;
   std::bitset<512> war_hint;
   PhysRegIterator rr_sgpr_it;
   PhysRegIterator rr_vgpr_it;

   uint16_t sgpr_bounds;
   uint16_t vgpr_bounds;
   uint16_t num_linear_vgprs;

   ra_test_policy policy;

   ra_ctx(Program* program_, ra_test_policy policy_)
       : program(program_), assignments(program->peekAllocationId()),
         renames(program->blocks.size(), aco::unordered_map<uint32_t, Temp>(memory)),
         orig_names(memory), vectors(memory), split_vectors(memory), policy(policy_)
   {
      pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
      phi_dummy.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
      sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
      vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

      sgpr_bounds = program->max_reg_demand.sgpr;
      vgpr_bounds = program->max_reg_demand.vgpr;
      num_linear_vgprs = 0;
   }
};

/* Half-open register interval used in "sliding window"-style for-loops */
struct PhysRegInterval {
   PhysReg lo_;
   unsigned size;

   /* Inclusive lower bound */
   PhysReg lo() const { return lo_; }

   /* Exclusive upper bound */
   PhysReg hi() const { return PhysReg{lo() + size}; }

   PhysRegInterval& operator+=(uint32_t stride)
   {
      lo_ = PhysReg{lo_.reg() + stride};
      return *this;
   }

   bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }

   /* Construct a half-open interval, excluding the end register */
   static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }

   bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }

   bool contains(const PhysRegInterval& needle) const
   {
      return needle.lo() >= lo() && needle.hi() <= hi();
   }

   PhysRegIterator begin() const { return {lo_}; }

   PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
};
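/* Example: PhysRegInterval{PhysReg{256}, 4} covers v0-v3; lo() is v0 and hi()
 * is v4, i.e. one past the end, matching the half-open convention above. */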

bool
intersects(const PhysRegInterval& a, const PhysRegInterval& b)
{
   return a.hi() > b.lo() && b.hi() > a.lo();
}

/* Gets the stride for full (non-subdword) registers */
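/* SGPR pairs must start at an even-numbered register, and larger SGPR tuples
 * are kept 4-register aligned; VGPRs can start anywhere. */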
uint32_t
get_stride(RegClass rc)
{
   if (rc.type() == RegType::vgpr) {
      return 1;
   } else {
      uint32_t size = rc.size();
      if (size == 2) {
         return 2;
      } else if (size >= 4) {
         return 4;
      } else {
         return 1;
      }
   }
}

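/* Linear VGPRs are carved out of the top of the VGPR file; regular VGPRs get
 * the remaining range below linear_vgpr_start. */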
PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr)
{
   uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs;
   if (type == RegType::vgpr && linear_vgpr) {
      return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs};
   } else if (type == RegType::vgpr) {
      return PhysRegInterval{PhysReg(256), linear_vgpr_start};
   } else {
      return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds};
   }
}

PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegClass rc)
{
   return get_reg_bounds(ctx, rc.type(), rc.is_linear_vgpr());
}

struct DefInfo {
   PhysRegInterval bounds;
   uint8_t size;
   uint8_t stride;
   /* Even if stride=4, we might be able to write to the high half instead without preserving the
    * low half. In that case, data_stride=2. */
   uint8_t data_stride;
   RegClass rc;

   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);
      data_stride = 0;

      bounds = get_reg_bounds(ctx, rc);

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
      } else if (rc.is_subdword()) {
         get_subdword_definition_info(ctx.program, instr);
      } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
         /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
          *
          * The register use is not calculated correctly, and the hardware assumes a
          * full dword per component. Don't use the last registers of the register file.
          * Otherwise, the instruction will be skipped.
          *
          * https://reviews.llvm.org/D81172
          */
         bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
         assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");

         if (imageGather4D16Bug)
            bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0);
      }

      if (!data_stride)
         data_stride = rc.is_subdword() ? stride : (stride * 4);
   }

private:
   void get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr);
};

class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

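   /* Per-register state: 0 = free, 0xFFFFFFFF = blocked, 0xF0000000 = holds
    * subdword temps (the per-byte temp-ids live in subdword_regs), and any
    * other value is the temp-id occupying the whole register. */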
   std::array<uint32_t, 512> regs;
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   unsigned count_zero(PhysRegInterval reg_interval) const
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes) const
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            auto it = subdword_regs.find(i);
            assert(it != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (it->second[j])
                  return true;
            }
         }
      }
      return false;
   }

   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   bool is_blocked(PhysReg start) const
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         auto it = subdword_regs.find(start);
         assert(it != subdword_regs.end());
         for (unsigned i = start.byte(); i < 4; i++)
            if (it->second[i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start) const
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         auto it = subdword_regs.find(start);
         assert(it != subdword_regs.end());
         return it->second[start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   void fill_killed_operands(Instruction* instr)
   {
      for (Operand& op : instr->operands) {
         if (op.isPrecolored()) {
            block(op.physReg(), op.regClass());
         } else if (op.isFixed() && op.isFirstKillBeforeDef()) {
            if (op.regClass().is_subdword())
               fill_subdword(op.physReg(), op.bytes(), op.tempId());
            else
               fill(op.physReg(), op.size(), op.tempId());
         }
      }
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   unsigned get_id(PhysReg reg) const
   {
      return regs[reg] == 0xF0000000 ? subdword_regs.at(reg)[reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
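/* A sketch of the subdword encoding with hypothetical ids: filling a 2-byte
 * temp %42 at v0.b2 sets regs[256] to 0xF0000000 and subdword_regs[256] to
 * {0, 0, 42, 42}; clearing it erases the map entry and resets regs[256] to 0. */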

std::vector<unsigned> find_vars(ra_ctx& ctx, const RegisterFile& reg_file,
                                const PhysRegInterval reg_interval);

/* helper function for debugging */
UNUSED void
print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
{
   if (reg_file[reg] == 0xFFFFFFFF) {
      printf((const char*)u8"☐");
   } else if (reg_file[reg]) {
      const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
      if (show_subdword_alloc) {
         auto block_chars = {
            // clang-format off
            u8"?", u8"▘", u8"▝", u8"▀",
            u8"▖", u8"▌", u8"▞", u8"▛",
            u8"▗", u8"▚", u8"▐", u8"▜",
            u8"▄", u8"▙", u8"▟", u8"▉"
            // clang-format on
         };
         unsigned index = 0;
         for (int i = 0; i < 4; ++i) {
            if (reg_file.subdword_regs.at(reg)[i]) {
               index |= 1 << i;
            }
         }
         printf("%s", (const char*)(block_chars.begin()[index]));
      } else {
         /* Indicate filled register slot */
         if (!has_adjacent_variable) {
            printf((const char*)u8"█");
         } else {
            /* Use a slightly shorter box to leave a small gap between adjacent variables */
            printf((const char*)u8"▉");
         }
      }
   } else {
      printf((const char*)u8"·");
   }
}

/* helper function for debugging */
UNUSED void
print_regs(ra_ctx& ctx, PhysRegInterval regs, const RegisterFile& reg_file)
{
   char reg_char = regs.lo().reg() >= 256 ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf("       ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         printf("  %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */
   for (unsigned id : find_vars(ctx, reg_file, regs)) {
      const assignment& var = ctx.assignments[id];
      PhysReg reg = var.reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() % 256);
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() % 256);
      }
      printf("]");
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}

bool
is_sgpr_writable_without_side_effects(amd_gfx_level gfx_level, PhysReg reg)
{
   assert(reg < 256);
   bool has_flat_scr_lo_gfx89 = gfx_level >= GFX8 && gfx_level <= GFX9;
   bool has_flat_scr_lo_gfx7_or_xnack_mask = gfx_level <= GFX9;
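   /* SGPRs 104-105 are flat_scratch on GFX7 and xnack_mask on GFX8-9. */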
   return (reg <= vcc_hi || reg == m0) &&
          (!has_flat_scr_lo_gfx89 || (reg != flat_scr_lo && reg != flat_scr_hi)) &&
          (!has_flat_scr_lo_gfx7_or_xnack_mask || (reg != 104 && reg != 105));
}

unsigned
get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                            unsigned idx, RegClass rc)
{
   assert(gfx_level >= GFX8);
   if (instr->isPseudo()) {
      /* v_readfirstlane_b32 cannot use SDWA */
      if (instr->opcode == aco_opcode::p_as_uniform)
         return 4;
      else
         return rc.bytes() % 2 == 0 ? 2 : 1;
   }

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (can_use_SDWA(gfx_level, instr, false))
         return rc.bytes();
      if (can_use_opsel(gfx_level, instr->opcode, idx))
         return 2;
      if (instr->isVOP3P())
         return 2;
   }

   switch (instr->opcode) {
   case aco_opcode::v_cvt_f32_ubyte0: return 1;
   case aco_opcode::ds_write_b8:
   case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
   case aco_opcode::buffer_store_byte:
   case aco_opcode::buffer_store_short:
   case aco_opcode::buffer_store_format_d16_x:
   case aco_opcode::flat_store_byte:
   case aco_opcode::flat_store_short:
   case aco_opcode::scratch_store_byte:
   case aco_opcode::scratch_store_short:
   case aco_opcode::global_store_byte:
   case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
   default: return 4;
   }
}

void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                     RegClass rc)
{
   amd_gfx_level gfx_level = ctx.program->gfx_level;
   if (instr->isPseudo() || byte == 0)
      return;

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
         switch (byte) {
         case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
         case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
         case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
         case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
         }
         return;
      }

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      /* use opsel */
      if (instr->isVOP3P()) {
         assert(byte == 2 && !instr->valu().opsel_lo[idx]);
         instr->valu().opsel_lo[idx] = true;
         instr->valu().opsel_hi[idx] = true;
         return;
      }

      assert(can_use_opsel(gfx_level, instr->opcode, idx));
      instr->valu().opsel[idx] = true;
      return;
   }

   assert(byte == 2);
   if (instr->opcode == aco_opcode::ds_write_b8)
      instr->opcode = aco_opcode::ds_write_b8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_write_b16)
      instr->opcode = aco_opcode::ds_write_b16_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_byte)
      instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_short)
      instr->opcode = aco_opcode::buffer_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
      instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_store_byte)
      instr->opcode = aco_opcode::flat_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_store_short)
      instr->opcode = aco_opcode::flat_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_byte)
      instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_short)
      instr->opcode = aco_opcode::scratch_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_byte)
      instr->opcode = aco_opcode::global_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_short)
      instr->opcode = aco_opcode::global_store_short_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
   return;
}

void
DefInfo::get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr)
{
   amd_gfx_level gfx_level = program->gfx_level;
   assert(gfx_level >= GFX8);

   stride = rc.bytes() % 2 == 0 ? 2 : 1;

   if (instr->isPseudo()) {
      if (instr->opcode == aco_opcode::p_interp_gfx11) {
         rc = RegClass(RegType::vgpr, rc.size());
         stride = 1;
      }
      return;
   }

   if (instr->isVALU()) {
      assert(rc.bytes() <= 2);

      if (can_use_SDWA(gfx_level, instr, false) || instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return;

      rc = instr_is_16bit(gfx_level, instr->opcode) ? v2b : v1;
      stride = rc == v2b ? 4 : 1;
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
          can_use_opsel(gfx_level, instr->opcode, -1)) {
         data_stride = 2;
         stride = rc == v2b ? 2 : stride;
      }
      return;
   }

   switch (instr->opcode) {
   case aco_opcode::v_interp_p2_f16: return;
   /* D16 loads with _hi version */
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_i8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_sbyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_sbyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_sbyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16:
   case aco_opcode::buffer_load_format_d16_x: {
      assert(gfx_level >= GFX9);
      if (program->dev.sram_ecc_enabled) {
         rc = v1;
         stride = 1;
         data_stride = 2;
      } else {
         stride = 2;
      }
      return;
   }
   /* 3-component D16 loads */
   case aco_opcode::buffer_load_format_d16_xyz:
   case aco_opcode::tbuffer_load_format_d16_xyz: {
      assert(gfx_level >= GFX9);
      if (program->dev.sram_ecc_enabled) {
         rc = v2;
         stride = 1;
      } else {
         stride = 4;
      }
      return;
   }
   default: break;
   }

   if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
      assert(gfx_level >= GFX9);
      stride = 4;
   } else {
      rc = RegClass(RegType::vgpr, rc.size());
      stride = 1;
   }
}

void
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
                        bool allow_16bit_write)
{
   if (instr->isPseudo())
      return;

   if (instr->isVALU()) {
      amd_gfx_level gfx_level = program->gfx_level;
      assert(instr->definitions[0].bytes() <= 2);

      if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return;

      if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode))
         return;

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      assert(allow_16bit_write);

      if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
         instr->opcode = aco_opcode::v_fma_mixhi_f16;
         return;
      }

      /* use opsel */
      assert(reg.byte() == 2);
      assert(can_use_opsel(gfx_level, instr->opcode, -1));
      instr->valu().opsel[3] = true; /* dst in high half */
      return;
   }

   if (reg.byte() == 0)
      return;
   else if (instr->opcode == aco_opcode::v_interp_p2_f16)
      instr->opcode = aco_opcode::v_interp_p2_hi_f16;
   else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
      instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
      instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_short_d16)
      instr->opcode = aco_opcode::buffer_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
      instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
      instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
      instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_short_d16)
      instr->opcode = aco_opcode::flat_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
      instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
      instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_short_d16)
      instr->opcode = aco_opcode::scratch_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
      instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
      instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_short_d16)
      instr->opcode = aco_opcode::global_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u8_d16)
      instr->opcode = aco_opcode::ds_read_u8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_i8_d16)
      instr->opcode = aco_opcode::ds_read_i8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u16_d16)
      instr->opcode = aco_opcode::ds_read_u16_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
}

void
adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
{
   uint16_t max_addressible_sgpr = ctx.sgpr_limit;
   unsigned size = rc.size();
   if (rc.type() == RegType::vgpr) {
      assert(reg >= 256);
      uint16_t hi = reg - 256 + size - 1;
      assert(hi <= 255);
      ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
   } else if (reg + rc.size() <= max_addressible_sgpr) {
      uint16_t hi = reg + size - 1;
      ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
   }
}

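/* Flags for update_renames(): without rename_not_killed_ops, operands that are
 * not killed may keep their original name to avoid unnecessary shuffle code. */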
enum UpdateRenames {
   rename_not_killed_ops = 0x1,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);

void
update_renames(ra_ctx& ctx, RegisterFile& reg_file,
               std::vector<std::pair<Operand, Definition>>& parallelcopies,
               aco_ptr<Instruction>& instr, UpdateRenames flags)
{
   /* clear operands */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;
      reg_file.clear(copy.first);
   }

   /* allocate id's and rename operands: this is done transparently here */
   auto it = parallelcopies.begin();
   while (it != parallelcopies.end()) {
      if (it->second.isTemp()) {
         ++it;
         continue;
      }

      /* check if we moved a definition: change the register and remove copy */
      bool is_def = false;
      for (Definition& def : instr->definitions) {
         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
            // FIXME: ensure that the definition can use this reg
            def.setFixed(it->second.physReg());
            reg_file.fill(def);
            ctx.assignments[def.tempId()].reg = def.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            break;
         }
      }
      if (is_def)
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (it->first.getTemp() == other.second.getTemp()) {
            other.second.setFixed(it->second.physReg());
            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            /* check if we moved an operand, again */
            bool fill = true;
            for (Operand& op : instr->operands) {
               if (op.isTemp() && op.tempId() == other.second.tempId()) {
                  // FIXME: ensure that the operand can use this reg
                  op.setFixed(other.second.physReg());
                  fill = !op.isKillBeforeDef();
               }
            }
            if (fill)
               reg_file.fill(other.second);
            break;
         }
      }
      if (is_def)
         continue;

      std::pair<Operand, Definition>& copy = *it;
      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());

      /* check if we moved an operand */
      bool first[2] = {true, true};
      bool fill = true;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* only rename precolored operands if the copy-location matches */
            bool omit_renaming = op.isPrecolored() && op.physReg() != copy.second.physReg();

            /* Omit renaming in some cases for p_create_vector in order to avoid
             * unnecessary shuffle code. */
            if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) {
               omit_renaming = true;
               for (std::pair<Operand, Definition>& pc : parallelcopies) {
                  PhysReg def_reg = pc.second.physReg();
                  omit_renaming &= def_reg > copy.first.physReg()
                                      ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
                                      : (def_reg + pc.second.size() <= copy.first.physReg().reg());
               }
            }

            /* Fix the kill flags */
            if (first[omit_renaming])
               op.setFirstKill(omit_renaming || op.isKill());
            else
               op.setKill(omit_renaming || op.isKill());
            first[omit_renaming] = false;

            if (omit_renaming)
               continue;

            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());

            fill = !op.isKillBeforeDef() || op.isPrecolored();
         }
      }

      /* Apply changes to register file. */
      if (fill)
         reg_file.fill(copy.second);

      ++it;
   }
}

std::optional<PhysReg>
get_reg_simple(ra_ctx& ctx, const RegisterFile& reg_file, DefInfo info)
{
   PhysRegInterval bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
   RegClass rc = info.rc;

   if (stride < size && !rc.is_subdword()) {
      DefInfo new_info = info;
      new_info.stride = stride * 2;
      if (size % new_info.stride == 0) {
         std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info);
         if (res)
            return res;
      }
   }

   PhysRegIterator& rr_it = rc.type() == RegType::vgpr ? ctx.rr_vgpr_it : ctx.rr_sgpr_it;
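   /* Round-robin allocation: resume the search where the previous allocation
    * ended, then wrap around to the start of the bounds if nothing was found. */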
   if (stride == 1) {
      if (rr_it != bounds.begin() && bounds.contains(rr_it.reg)) {
         assert(bounds.begin() < rr_it);
         assert(rr_it < bounds.end());
         info.bounds = PhysRegInterval::from_until(rr_it.reg, bounds.hi());
         std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
         if (res)
            return res;
         bounds = PhysRegInterval::from_until(bounds.lo(), rr_it.reg);
      }
   }

   auto is_free = [&](PhysReg reg_index)
   { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };

   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      if (std::all_of(reg_win.begin(), reg_win.end(), is_free)) {
         if (stride == 1) {
            PhysRegIterator new_rr_it{PhysReg{reg_win.lo() + size}};
            if (new_rr_it < bounds.end())
               rr_it = new_rr_it;
         }
         adjust_max_used_regs(ctx, rc, reg_win.lo());
         return reg_win.lo();
      }
   }

   /* do this late because using the upper bytes of a register can require
    * larger instruction encodings or copies
    * TODO: don't do this in situations where it doesn't benefit */
   if (rc.is_subdword()) {
      for (const std::pair<const uint32_t, std::array<uint32_t, 4>>& entry :
           reg_file.subdword_regs) {
         assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
         if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
            continue;

         auto it = entry.second.begin();
         for (unsigned i = 0; i < 4; i += info.stride) {
            /* check if there's a block of free bytes large enough to hold the register */
            bool reg_found =
               std::all_of(std::next(it, i), std::next(it, std::min(4u, i + rc.bytes())),
                           [](unsigned v) { return v == 0; });

            /* check if also the neighboring reg is free if needed */
            if (reg_found && i + rc.bytes() > 4)
               reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);

            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               adjust_max_used_regs(ctx, rc, entry.first);
               return res;
            }
         }
      }
   }

   return {};
}

/* collect variables from a register area */
std::vector<unsigned>
find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> vars;
   for (PhysReg j : reg_interval) {
      if (reg_file.is_blocked(j))
         continue;
      if (reg_file[j] == 0xF0000000) {
         for (unsigned k = 0; k < 4; k++) {
            unsigned id = reg_file.subdword_regs.at(j)[k];
            if (id && (vars.empty() || id != vars.back()))
               vars.emplace_back(id);
         }
      } else {
         unsigned id = reg_file[j];
         if (id && (vars.empty() || id != vars.back()))
            vars.emplace_back(id);
      }
   }
   return vars;
}

/* collect variables from a register area and clear reg_file
 * variables are sorted in decreasing size and
 * increasing assigned register
 */
std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
   std::sort(ids.begin(), ids.end(),
             [&](unsigned a, unsigned b)
             {
                assignment& var_a = ctx.assignments[a];
                assignment& var_b = ctx.assignments[b];
                return var_a.rc.bytes() > var_b.rc.bytes() ||
                       (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
             });

   for (unsigned id : ids) {
      assignment& var = ctx.assignments[id];
      reg_file.clear(var.reg, var.rc);
   }
   return ids;
}

std::optional<PhysReg>
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
                               aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
                               DefInfo info, unsigned id)
{
   PhysReg reg = def_reg.lo();
   /* dead operand: return position in vector */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
          instr->operands[i].isKillBeforeDef()) {
         assert(!reg_file.test(reg, instr->operands[i].bytes()));
         if (info.rc.is_subdword() || reg.byte() == 0)
            return reg;
         else
            return {};
      }
      reg.reg_b += instr->operands[i].bytes();
   }

   /* GFX9+ has a VGPR swap instruction. */
   if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr)
      return {};

   /* check if the previous position was in vector */
   assignment& var = ctx.assignments[id];
   if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
      reg = def_reg.lo();
      /* try to use the previous register of the operand */
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         if (reg != var.reg) {
            reg.reg_b += instr->operands[i].bytes();
            continue;
         }

         /* check if we can swap positions */
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
             instr->operands[i].regClass() == info.rc) {
            assignment& op = ctx.assignments[instr->operands[i].tempId()];
            /* if everything matches, create parallelcopy for the killed operand */
            if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc &&
                reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
               Definition pc_def = Definition(reg, info.rc);
               parallelcopies.emplace_back(instr->operands[i], pc_def);
               return op.reg;
            }
         }
         return {};
      }
   }
   return {};
}

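/* Assigns new registers to the given variables, recursively displacing other
 * variables where necessary, and records the required parallelcopies. */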
bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
                    const PhysRegInterval def_reg)
{
   /* Variables are sorted from large to small and with increasing assigned register */
   for (unsigned id : vars) {
      assignment& var = ctx.assignments[id];
      PhysRegInterval bounds = get_reg_bounds(ctx, var.rc);
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition
       * also use the correct stride for sub-dword operands */
      bool is_dead_operand = false;
      std::optional<PhysReg> res;
      if (instr->opcode == aco_opcode::p_create_vector) {
         res =
            get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
      } else {
         for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
            if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
               info = DefInfo(ctx, instr, var.rc, i);
               if (instr->operands[i].isKillBeforeDef()) {
                  info.bounds = def_reg;
                  res = get_reg_simple(ctx, reg_file, info);
                  is_dead_operand = true;
               }
               break;
            }
         }
      }
      if (!res && !def_reg.size) {
         /* If this is before definitions are handled, def_reg may be an empty interval. */
         info.bounds = bounds;
         res = get_reg_simple(ctx, reg_file, info);
      } else if (!res) {
         /* Try to find space within the bounds but outside of the definition */
         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
         res = get_reg_simple(ctx, reg_file, info);
         if (!res && def_reg.hi() <= bounds.hi()) {
            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res) {
         /* mark the area as blocked */
         reg_file.block(*res, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(*res, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      PhysReg best_pos = bounds.lo();
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
           reg_win += stride) {
         if (!is_dead_operand && intersects(reg_win, def_reg))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (PhysReg j : reg_win) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(j) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_win.lo();
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      PhysRegInterval reg_win{best_pos, size};

      /* collect variables and block reg file */
      std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);

      /* mark the area as blocked */
      reg_file.block(reg_win.lo(), var.rc);
      adjust_max_used_regs(ctx, var.rc, reg_win.lo());

      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg))
         return false;

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}

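/* Slides a window of the definition's size across the bounds and picks the
 * position that displaces the fewest registers, then finds new homes for the
 * displaced variables. */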
1266 std::optional<PhysReg>
get_reg_impl(ra_ctx & ctx,const RegisterFile & reg_file,std::vector<std::pair<Operand,Definition>> & parallelcopies,const DefInfo & info,aco_ptr<Instruction> & instr)1267 get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
1268              std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
1269              aco_ptr<Instruction>& instr)
1270 {
1271    const PhysRegInterval& bounds = info.bounds;
1272    uint32_t size = info.size;
1273    uint32_t stride = info.stride;
1274    RegClass rc = info.rc;
1275 
1276    /* check how many free regs we have */
1277    unsigned regs_free = reg_file.count_zero(bounds);
1278 
1279    /* mark and count killed operands */
1280    unsigned killed_ops = 0;
1281    std::bitset<256> is_killed_operand; /* per-register */
1282    for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
1283       Operand& op = instr->operands[j];
1284       if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
1285           !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
1286          assert(op.isFixed());
1287 
1288          for (unsigned i = 0; i < op.size(); ++i) {
1289             is_killed_operand[(op.physReg() & 0xff) + i] = true;
1290          }
1291 
1292          killed_ops += op.getTemp().size();
1293       }
1294    }
1295 
1296    assert((regs_free + ctx.num_linear_vgprs) >= size);
1297 
1298    /* we might have to move dead operands to dst in order to make space */
1299    unsigned op_moves = 0;
1300 
1301    if (size > (regs_free - killed_ops))
1302       op_moves = size - (regs_free - killed_ops);
1303 
1304    /* find the best position to place the definition */
1305    PhysRegInterval best_win = {bounds.lo(), size};
1306    unsigned num_moves = 0xFF;
1307    unsigned num_vars = 0;
1308 
1309    /* we use a sliding window to check potential positions */
1310    for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
1311         reg_win += stride) {
1312       /* first check if the register window starts in the middle of an
1313        * allocated variable: this is what we have to fix to allow for
1314        * num_moves > size */
1315       if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
1316           reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
1317          continue;
1318       if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
1319           reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
1320          continue;
1321 
1322       /* second, check that we have at most k=num_moves elements in the window
1323        * and no element is larger than the currently processed one */
1324       unsigned k = op_moves;
1325       unsigned n = 0;
1326       unsigned remaining_op_moves = op_moves;
1327       unsigned last_var = 0;
1328       bool found = true;
1329       bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
1330       for (const PhysReg j : reg_win) {
1331          /* dead operands effectively reduce the number of estimated moves */
1332          if (is_killed_operand[j & 0xFF]) {
1333             if (remaining_op_moves) {
1334                k--;
1335                remaining_op_moves--;
1336             }
1337             continue;
1338          }
1339 
1340          if (reg_file[j] == 0 || reg_file[j] == last_var)
1341             continue;
1342 
1343          if (reg_file[j] == 0xF0000000) {
1344             k += 1;
1345             n++;
1346             continue;
1347          }
1348 
1349          if (ctx.assignments[reg_file[j]].rc.size() >= size) {
1350             found = false;
1351             break;
1352          }
1353 
1354          /* we cannot split live ranges of linear vgprs */
1355          if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
1356             found = false;
1357             break;
1358          }
1359 
1360          k += ctx.assignments[reg_file[j]].rc.size();
1361          n++;
1362          last_var = reg_file[j];
1363       }
1364 
1365       if (!found || k > num_moves)
1366          continue;
1367       if (k == num_moves && n < num_vars)
1368          continue;
1369       if (!aligned && k == num_moves && n == num_vars)
1370          continue;
1371 
1372       if (found) {
1373          best_win = reg_win;
1374          num_moves = k;
1375          num_vars = n;
1376       }
1377    }
1378 
1379    if (num_moves == 0xFF)
1380       return {};
1381 
1382    /* now, we figured the placement for our definition */
1383    RegisterFile tmp_file(reg_file);
1384 
1385    /* p_create_vector: also re-place killed operands in the definition space */
1386    if (instr->opcode == aco_opcode::p_create_vector)
1387       tmp_file.fill_killed_operands(instr.get());
1388 
1389    std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);
1390 
1391    /* re-enable killed operands */
1392    if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector)
1393       tmp_file.fill_killed_operands(instr.get());
1394 
1395    std::vector<std::pair<Operand, Definition>> pc;
1396    if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
1397       return {};
1398 
1399    parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1400 
1401    adjust_max_used_regs(ctx, rc, best_win.lo());
1402    return best_win.lo();
1403 }
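/* Illustrative walk-through of the sliding-window search above (register
 * contents assumed, not taken from real ACO output): for a v2 definition with
 * bounds v[0-7], where
 *
 *    v[0] = A (v1),  v[1] = free,  v[2-3] = B (v2),  v[4] = free,
 *    v[5] = killed operand,  v[6-7] = free
 *
 * the window {v[1], 2} is rejected because its upper border would split B,
 * while {v[4], 2} costs k = 0 moves since the killed operand counts as free
 * space. Among valid windows, the one with the fewest moved registers (k),
 * then the fewest moved variables (n), wins.
 */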
1404 
1405 bool
1406 get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
1407                   aco_ptr<Instruction>& instr, PhysReg reg, int operand)
1408 {
1409    /* catch out-of-range registers */
1410    if (reg >= PhysReg{512})
1411       return false;
1412 
1413    DefInfo info(ctx, instr, rc, operand);
1414 
1415    if (reg.reg_b % info.data_stride)
1416       return false;
1417 
1418    assert(util_is_power_of_two_nonzero(info.stride));
1419    reg.reg_b &= ~(info.stride - 1);
1420 
1421    PhysRegInterval reg_win = {PhysReg(reg.reg()), info.rc.size()};
1422    PhysRegInterval vcc_win = {vcc, 2};
1423    /* VCC is outside the bounds */
1424    bool is_vcc =
1425       info.rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
1426    bool is_m0 = info.rc == s1 && reg == m0 && can_write_m0(instr);
1427    if (!info.bounds.contains(reg_win) && !is_vcc && !is_m0)
1428       return false;
1429 
1430    if (reg_file.test(reg, info.rc.bytes()))
1431       return false;
1432 
1433    adjust_max_used_regs(ctx, info.rc, reg_win.lo());
1434    return true;
1435 }
1436 
1437 bool
1438 increase_register_file(ra_ctx& ctx, RegClass rc)
1439 {
1440    if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
1441        ctx.vgpr_bounds < ctx.vgpr_limit) {
1442       /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
1443       update_vgpr_sgpr_demand(
1444          ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));
1445 
1446       ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
1447    } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
1448       update_vgpr_sgpr_demand(
1449          ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));
1450 
1451       ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
1452    } else {
1453       return false;
1454    }
1455 
1456    return true;
1457 }
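/* A minimal usage sketch (limits assumed): with vgpr_bounds = 24 and
 * vgpr_limit = 256, a failed VGPR allocation can do
 *
 *    if (increase_register_file(ctx, v1))
 *       // retry the allocation with the widened bounds
 *
 * which raises the program's VGPR demand by one register. Note that the VGPR
 * file cannot grow while linear VGPRs are allocated at its top, and growth
 * fails entirely once the respective limit is reached.
 */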
1458 
1459 struct IDAndRegClass {
1460    IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}
1461 
1462    unsigned id;
1463    RegClass rc;
1464 };
1465 
1466 struct IDAndInfo {
1467    IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}
1468 
1469    unsigned id;
1470    DefInfo info;
1471 };
1472 
1473 void
1474 add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val)
1475 {
1476    ctx.renames[ctx.block->index][orig_val.id()] = new_val;
1477    ctx.orig_names.emplace(new_val.id(), orig_val);
1478    ctx.assignments[orig_val.id()].renamed = true;
1479 }
1480 
1481 /* Reallocates vars by sorting them and placing each variable after the previous
1482  * one. If one of the variables has 0xffffffff as an ID, the register assigned
1483  * for that variable will be returned.
1484  */
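/* Sketch of the effect (IDs and registers assumed): with start = v[0] and
 * vars = { id 5: v2 currently at v[4-5], id 9: v1 currently at v[7] }, the
 * variables are sorted by stride and position, and parallelcopies
 *
 *    v[0-1] = p_parallelcopy v[4-5]   ; id 5
 *    v[2]   = p_parallelcopy v[7]     ; id 9
 *
 * are appended, leaving everything packed upwards from `start`.
 */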
1485 PhysReg
1486 compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
1487                       std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
1488 {
1489    /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
1490     * temporary sizes to dwords.
1491     */
1492    std::vector<IDAndInfo> sorted;
1493    for (IDAndRegClass var : vars) {
1494       DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
1495       sorted.emplace_back(var.id, info);
1496    }
1497 
1498    std::sort(
1499       sorted.begin(), sorted.end(),
1500       [&ctx](const IDAndInfo& a, const IDAndInfo& b)
1501       {
1502          unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
1503          unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
1504          if (a_stride > b_stride)
1505             return true;
1506          if (a_stride < b_stride)
1507             return false;
1508          if (a.id == 0xffffffff || b.id == 0xffffffff)
1509             return a.id == 0xffffffff; /* arbitrarily place 0xffffffff first;
1510                                         * the relative order doesn't matter */
1511          return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
1512       });
1513 
1514    PhysReg next_reg = start;
1515    PhysReg space_reg;
1516    for (IDAndInfo& var : sorted) {
1517       unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
1518       next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));
1519 
1520       /* 0xffffffff is a special variable ID used to reserve space for killed
1521        * operands and definitions.
1522        */
1523       if (var.id != 0xffffffff) {
1524          if (next_reg != ctx.assignments[var.id].reg) {
1525             RegClass rc = ctx.assignments[var.id].rc;
1526             Temp tmp(var.id, rc);
1527 
1528             Operand pc_op(tmp);
1529             pc_op.setFixed(ctx.assignments[var.id].reg);
1530             Definition pc_def(next_reg, rc);
1531             parallelcopies.emplace_back(pc_op, pc_def);
1532          }
1533       } else {
1534          space_reg = next_reg;
1535       }
1536 
1537       adjust_max_used_regs(ctx, var.info.rc, next_reg);
1538 
1539       next_reg = next_reg.advance(var.info.rc.size() * 4);
1540    }
1541 
1542    return space_reg;
1543 }
1544 
1545 bool
1546 is_vector_intact(ra_ctx& ctx, const RegisterFile& reg_file, const vector_info& vec_info)
1547 {
1548    unsigned size = 0;
1549    for (unsigned i = 0; i < vec_info.num_parts; i++)
1550       size += vec_info.parts[i].bytes();
1551 
1552    PhysReg first{512};
1553    int offset = 0;
1554    for (unsigned i = 0; i < vec_info.num_parts; i++) {
1555       Operand op = vec_info.parts[i];
1556 
1557       if (ctx.assignments[op.tempId()].assigned) {
1558          PhysReg reg = ctx.assignments[op.tempId()].reg;
1559 
1560          if (first.reg() == 512) {
1561             PhysRegInterval bounds = get_reg_bounds(ctx, RegType::vgpr, false);
1562             first = reg.advance(-offset);
1563             PhysRegInterval vec = PhysRegInterval{first, DIV_ROUND_UP(size, 4)};
1564             if (!bounds.contains(vec)) /* not enough space for other operands */
1565                return false;
1566          } else {
1567             if (reg != first.advance(offset)) /* not at the best position */
1568                return false;
1569          }
1570       } else {
1571          /* If there's an unexpected temporary, this operand is unlikely to be
1572           * placed in the best position.
1573           */
1574          if (first.reg() != 512 && reg_file.test(first.advance(offset), op.bytes()))
1575             return false;
1576       }
1577 
1578       offset += op.bytes();
1579    }
1580 
1581    return true;
1582 }
1583 
1584 std::optional<PhysReg>
1585 get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr,
1586                int operand)
1587 {
1588    const vector_info& vec = ctx.vectors[temp.id()];
1589    if (!vec.is_weak || is_vector_intact(ctx, reg_file, vec)) {
1590       unsigned our_offset = 0;
1591       for (unsigned i = 0; i < vec.num_parts; i++) {
1592          const Operand& op = vec.parts[i];
1593          if (op.isTemp() && op.tempId() == temp.id())
1594             break;
1595          else
1596             our_offset += op.bytes();
1597       }
1598 
1599       unsigned their_offset = 0;
1600       /* for every operand of the vector, check
1601        * - whether the operand is already assigned and
1602        * - whether we can use a register placed relative to that operand
1603        */
1604       for (unsigned i = 0; i < vec.num_parts; i++) {
1605          const Operand& op = vec.parts[i];
1606          if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
1607              ctx.assignments[op.tempId()].assigned) {
1608             PhysReg reg = ctx.assignments[op.tempId()].reg;
1609             reg.reg_b += (our_offset - their_offset);
1610             if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg, operand))
1611                return reg;
1612 
1613             /* return if MIMG vaddr components don't remain vector-aligned */
1614             if (vec.is_weak)
1615                return {};
1616          }
1617          their_offset += op.bytes();
1618       }
1619 
1620       /* We didn't find a register relative to other vector operands.
1621        * Try to find new space which fits the whole vector.
1622        */
1623       RegClass vec_rc = RegClass::get(temp.type(), their_offset);
1624       DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
1625       std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info);
1626       if (reg) {
1627          reg->reg_b += our_offset;
1628          /* make sure to only use byte offset if the instruction supports it */
1629          if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg, operand))
1630             return reg;
1631       }
1632    }
1633    return {};
1634 }
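/* Example of the relative placement above (registers assumed): for a vector
 * { %a (v2), %b (v1) } with %a already assigned to v[4-5], allocating %b
 * first tries v[6] (= %a's register plus the byte distance between the two
 * operands). Only if that register is occupied or unsupported do we look for
 * fresh space for the whole v3 vector and place %b at its offset inside it.
 */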
1635 
1636 bool
1637 compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
1638                      std::vector<std::pair<Operand, Definition>>& parallelcopies)
1639 {
1640    PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
1641    int zeros = reg_file.count_zero(linear_vgpr_bounds);
1642    if (zeros == 0)
1643       return false;
1644 
1645    std::vector<IDAndRegClass> vars;
1646    for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds))
1647       vars.emplace_back(id, ctx.assignments[id].rc);
1648 
1649    ctx.num_linear_vgprs -= zeros;
1650    compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo());
1651 
1652    return true;
1653 }
1654 
1655 /* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate
1656  * from normal VGPRs. This is for two reasons:
1657  * - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a
1658  *   linear one, we never have to swap a normal VGPR and a linear one.
1659  * - As linear VGPRs' live ranges only start and end on top-level blocks, we never have to move a
1660  *   linear VGPR in control flow.
1661  */
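/* Layout sketch (numbers assumed): with vgpr_bounds = 64, normal VGPRs grow
 * upwards from v[0] while linear VGPRs are packed downwards from v[63], so
 * num_linear_vgprs = 3 means v[61-63] hold linear VGPRs and the normal
 * allocation window shrinks to v[0-60].
 */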
1662 PhysReg
1663 alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
1664                   std::vector<std::pair<Operand, Definition>>& parallelcopies)
1665 {
1666    assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
1667    assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
1668 
1669    RegClass rc = instr->definitions[0].regClass();
1670 
1671    /* Try to choose an unused space in the linear VGPR bounds. */
1672    for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) {
1673       PhysReg reg(256 + ctx.vgpr_bounds - i);
1674       if (!reg_file.test(reg, rc.bytes())) {
1675          adjust_max_used_regs(ctx, rc, reg);
1676          return reg;
1677       }
1678    }
1679 
1680    PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
1681 
1682    /* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */
1683    compact_linear_vgprs(ctx, reg_file, parallelcopies);
1684 
1685    PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size()));
1686    /* Space that was for normal VGPRs, but is now for linear VGPRs. */
1687    PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg));
1688 
1689    RegisterFile tmp_file(reg_file);
1690    PhysRegInterval reg_win{reg, rc.size()};
1691    std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win);
1692 
1693    /* Re-enable killed operands */
1694    tmp_file.fill_killed_operands(instr.get());
1695 
1696    /* Find new assignments for blocking vars. */
1697    std::vector<std::pair<Operand, Definition>> pc;
1698    if (!ctx.policy.skip_optimistic_path &&
1699        get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
1700       parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1701    } else {
1702       /* Fallback algorithm: reallocate all variables at once. */
1703       std::vector<IDAndRegClass> vars;
1704       for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds))
1705          vars.emplace_back(id, ctx.assignments[id].rc);
1706       compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256));
1707 
1708       std::vector<IDAndRegClass> killed_op_vars;
1709       for (Operand& op : instr->operands) {
1710          if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr)
1711             killed_op_vars.emplace_back(op.tempId(), op.regClass());
1712       }
1713       compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo());
1714    }
1715 
1716    /* If this is updated earlier, a killed operand can't be placed inside the definition. */
1717    ctx.num_linear_vgprs += rc.size();
1718 
1719    adjust_max_used_regs(ctx, rc, reg);
1720    return reg;
1721 }
1722 
1723 bool
1724 should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file)
1725 {
1726    if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty())
1727       return false;
1728 
1729    /* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to
1730     * ensure in advance that there is enough space for normal VGPRs. */
1731    unsigned max_vgpr_usage = 0;
1732    unsigned next_toplevel = ctx.block->index + 1;
1733    for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) {
1734       max_vgpr_usage =
1735          MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr);
1736    }
1737    max_vgpr_usage =
1738       MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].live_in_demand.vgpr);
1739 
1740    for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true)))
1741       max_vgpr_usage -= ctx.assignments[tmp].rc.size();
1742 
1743    return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
1744 }
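/* Worked example (demands assumed): if the blocks up to the next top-level
 * block demand at most 100 VGPRs, and 4 of those are linear VGPRs already
 * placed at the top of the file, compaction is requested when the remaining
 * 96 normal VGPRs do not fit into the current normal bounds.
 */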
1745 
1746 PhysReg
1747 get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
1748         std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
1749         int operand_index = -1)
1750 {
1751    auto split_vec = ctx.split_vectors.find(temp.id());
1752    if (split_vec != ctx.split_vectors.end()) {
1753       unsigned offset = 0;
1754       for (Definition def : split_vec->second->definitions) {
1755          if (ctx.assignments[def.tempId()].affinity) {
1756             assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
1757             if (affinity.assigned) {
1758                PhysReg reg = affinity.reg;
1759                reg.reg_b -= offset;
1760                if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg, operand_index))
1761                   return reg;
1762             }
1763          }
1764          offset += def.bytes();
1765       }
1766    }
1767 
1768    if (ctx.assignments[temp.id()].affinity) {
1769       assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
1770       if (affinity.assigned) {
1771          if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg, operand_index))
1772             return affinity.reg;
1773       }
1774    }
1775    if (ctx.assignments[temp.id()].vcc) {
1776       if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc, operand_index))
1777          return vcc;
1778    }
1779    if (ctx.assignments[temp.id()].m0) {
1780       if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0, operand_index))
1781          return m0;
1782    }
1783 
1784    std::optional<PhysReg> res;
1785 
1786    if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
1787       res = get_reg_vector(ctx, reg_file, temp, instr, operand_index);
1788       if (res)
1789          return *res;
1790    }
1791 
1792    if (temp.size() == 1 && operand_index == -1) {
1793       for (const Operand& op : instr->operands) {
1794          if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass() == temp.regClass()) {
1795             assert(op.isFixed());
1796             if (op.physReg() == vcc || op.physReg() == vcc_hi)
1797                continue;
1798             if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, op.physReg(),
1799                                   operand_index))
1800                return op.physReg();
1801          }
1802       }
1803    }
1804 
1805    DefInfo info(ctx, instr, temp.regClass(), operand_index);
1806 
1807    if (!ctx.policy.skip_optimistic_path) {
1808       /* try to find space without live-range splits */
1809       res = get_reg_simple(ctx, reg_file, info);
1810 
1811       if (res)
1812          return *res;
1813    }
1814 
1815    /* try to find space with live-range splits */
1816    res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
1817 
1818    if (res)
1819       return *res;
1820 
1821    /* try compacting the linear vgprs to make more space */
1822    std::vector<std::pair<Operand, Definition>> pc;
1823    if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
1824        compact_linear_vgprs(ctx, reg_file, pc)) {
1825       parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1826 
1827       /* We don't need to fill the copy definitions in because we don't care about the linear VGPR
1828        * space here. */
1829       RegisterFile tmp_file(reg_file);
1830       for (std::pair<Operand, Definition>& copy : pc)
1831          tmp_file.clear(copy.first);
1832 
1833       return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
1834    }
1835 
1836    /* We should only fail here because keeping under the limit would require
1837     * too many moves. */
1838    assert(reg_file.count_zero(info.bounds) >= info.size);
1839 
1840    /* try using more registers */
1841    if (!increase_register_file(ctx, info.rc)) {
1842       /* fallback algorithm: reallocate all variables at once (linear VGPRs should already be
1843        * compact at the end) */
1844       unsigned def_size = info.rc.size();
1845       for (Definition def : instr->definitions) {
1846          if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1847             def_size += def.regClass().size();
1848       }
1849 
1850       unsigned killed_op_size = 0;
1851       for (Operand op : instr->operands) {
1852          if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == info.rc.type())
1853             killed_op_size += op.regClass().size();
1854       }
1855 
1856       const PhysRegInterval regs = get_reg_bounds(ctx, info.rc);
1857 
1858       /* reallocate passthrough variables and non-killed operands */
1859       std::vector<IDAndRegClass> vars;
1860       for (unsigned id : find_vars(ctx, reg_file, regs))
1861          vars.emplace_back(id, ctx.assignments[id].rc);
1862       vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));
1863 
1864       PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());
1865 
1866       /* reallocate killed operands */
1867       std::vector<IDAndRegClass> killed_op_vars;
1868       for (Operand op : instr->operands) {
1869          if (op.isFirstKillBeforeDef() && op.regClass().type() == info.rc.type())
1870             killed_op_vars.emplace_back(op.tempId(), op.regClass());
1871       }
1872       compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);
1873 
1874       /* reallocate definitions */
1875       std::vector<IDAndRegClass> def_vars;
1876       for (Definition def : instr->definitions) {
1877          if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1878             def_vars.emplace_back(def.tempId(), def.regClass());
1879       }
1880       def_vars.emplace_back(0xffffffff, info.rc);
1881       return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
1882    }
1883 
1884    return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
1885 }
1886 
1887 PhysReg
1888 get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
1889                       std::vector<std::pair<Operand, Definition>>& parallelcopies,
1890                       aco_ptr<Instruction>& instr)
1891 {
1892    RegClass rc = temp.regClass();
1893    /* create_vector instructions have different costs w.r.t. register coalescing */
1894    uint32_t size = rc.size();
1895    uint32_t bytes = rc.bytes();
1896    uint32_t stride = get_stride(rc);
1897    PhysRegInterval bounds = get_reg_bounds(ctx, rc);
1898 
1899    // TODO: improve p_create_vector for sub-dword vectors
1900 
1901    PhysReg best_pos{0xFFF};
1902    unsigned num_moves = 0xFF;
1903    bool best_avoid = true;
1904    uint32_t correct_pos_mask = 0;
1905 
1906    /* test for each operand which definition placement causes the least shuffle instructions */
1907    for (unsigned i = 0, offset = 0; i < instr->operands.size();
1908         offset += instr->operands[i].bytes(), i++) {
1909       // TODO: think about whether we can alias live operands on the same register
1910       if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
1911           instr->operands[i].getTemp().type() != rc.type())
1912          continue;
1913 
1914       if (offset > instr->operands[i].physReg().reg_b)
1915          continue;
1916 
1917       unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
1918       if (reg_lower % 4)
1919          continue;
1920       PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
1921       unsigned k = 0;
1922 
1923       /* no need to check multiple times */
1924       if (reg_win.lo() == best_pos)
1925          continue;
1926 
1927       /* check borders */
1928       // TODO: this can be improved
1929       if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
1930          continue;
1931       if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
1932           reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
1933          continue;
1934       if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
1935           reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
1936          continue;
1937 
1938       /* count variables to be moved and check "avoid" */
1939       bool avoid = false;
1940       bool linear_vgpr = false;
1941       for (PhysReg j : reg_win) {
1942          if (reg_file[j] != 0) {
1943             if (reg_file[j] == 0xF0000000) {
1944                PhysReg reg;
1945                reg.reg_b = j * 4;
1946                unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
1947                for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
1948                   k += reg_file.test(reg, 1);
1949             } else {
1950                k += 4;
1951                linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
1952             }
1953          }
1954          avoid |= ctx.war_hint[j];
1955       }
1956 
1957       /* we cannot split live ranges of linear vgprs */
1958       if (linear_vgpr)
1959          continue;
1960 
1961       if (avoid && !best_avoid)
1962          continue;
1963 
1964       /* count operands in wrong positions */
1965       uint32_t correct_pos_mask_new = 0;
1966       for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
1967            offset2 += instr->operands[j].bytes(), j++) {
1968          Operand& op = instr->operands[j];
1969          if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
1970             correct_pos_mask_new |= 1 << j;
1971          else
1972             k += op.bytes();
1973       }
1974       bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
1975       if (k > num_moves || (!aligned && k == num_moves))
1976          continue;
1977 
1978       best_pos = reg_win.lo();
1979       num_moves = k;
1980       best_avoid = avoid;
1981       correct_pos_mask = correct_pos_mask_new;
1982    }
1983 
1984    /* too many moves: try the generic get_reg() function */
1985    if (num_moves >= 2 * bytes) {
1986       return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1987    } else if (num_moves > bytes) {
1988       DefInfo info(ctx, instr, rc, -1);
1989       std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
1990       if (res)
1991          return *res;
1992    }
1993 
1994    /* re-enable killed operands which are in the wrong position */
1995    RegisterFile tmp_file(reg_file);
1996    tmp_file.fill_killed_operands(instr.get());
1997 
1998    for (unsigned i = 0; i < instr->operands.size(); i++) {
1999       if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
2000          tmp_file.clear(instr->operands[i]);
2001    }
2002 
2003    /* collect variables to be moved */
2004    std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
2005 
2006    bool success = false;
2007    std::vector<std::pair<Operand, Definition>> pc;
2008    success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});
2009 
2010    if (!success) {
2011       if (!increase_register_file(ctx, temp.regClass())) {
2012          /* use the fallback algorithm in get_reg() */
2013          return get_reg(ctx, reg_file, temp, parallelcopies, instr);
2014       }
2015       return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
2016    }
2017 
2018    parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
2019    adjust_max_used_regs(ctx, rc, best_pos);
2020 
2021    return best_pos;
2022 }
2023 
2024 void
2025 handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
2026 {
2027    if (instr->format != Format::PSEUDO)
2028       return;
2029 
2030    /* all instructions which use handle_operands() need this information */
2031    switch (instr->opcode) {
2032    case aco_opcode::p_extract_vector:
2033    case aco_opcode::p_create_vector:
2034    case aco_opcode::p_split_vector:
2035    case aco_opcode::p_parallelcopy:
2036    case aco_opcode::p_start_linear_vgpr: break;
2037    default: return;
2038    }
2039 
2040    bool writes_linear = false;
2041    /* if all definitions are logical vgpr, no need to care for SCC */
2042    for (Definition& def : instr->definitions) {
2043       if (def.getTemp().regClass().is_linear())
2044          writes_linear = true;
2045    }
2046    /* if all operands are constant, no need to care either */
2047    bool reads_linear = false;
2048    for (Operand& op : instr->operands) {
2049       if (op.isTemp() && op.getTemp().regClass().is_linear())
2050          reads_linear = true;
2051    }
2052 
2053    if (!writes_linear || !reads_linear)
2054       return;
2055 
2056    instr->pseudo().needs_scratch_reg = true;
2057 
2058    if (!reg_file[scc]) {
2059       instr->pseudo().scratch_sgpr = scc;
2060       return;
2061    }
2062 
2063    int reg = ctx.max_used_sgpr;
2064    for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
2065       ;
2066    if (reg < 0) {
2067       reg = ctx.max_used_sgpr + 1;
2068       for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
2069          ;
2070    }
2071 
2072    adjust_max_used_regs(ctx, s1, reg);
2073    instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
2074 }
2075 
2076 bool
2077 operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
2078                     RegClass rc)
2079 {
2080    if (reg.byte()) {
2081       unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
2082       if (reg.byte() % stride)
2083          return false;
2084    }
2085 
2086    switch (instr->format) {
2087    case Format::SMEM:
2088       return reg != scc && reg != exec &&
2089              (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
2090              (reg != vcc || (instr->definitions.empty() && idx == 2) ||
2091               gfx_level >= GFX10); /* sdata can be vcc */
2092    case Format::MUBUF:
2093    case Format::MTBUF: return idx != 2 || gfx_level < GFX12 || reg != scc;
2094    case Format::SOPK:
2095       if (idx == 0 && reg == scc)
2096          return false;
2097       FALLTHROUGH;
2098    case Format::SOP2:
2099    case Format::SOP1:
2100       return get_op_fixed_to_def(instr.get()) != (int)idx ||
2101              is_sgpr_writable_without_side_effects(gfx_level, reg);
2102    default:
2103       // TODO: there are more instructions with restrictions on registers
2104       return true;
2105    }
2106 }
2107 
2108 void
2109 handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
2110                       std::vector<std::pair<Operand, Definition>>& parallelcopy,
2111                       aco_ptr<Instruction>& instr)
2112 {
2113    assert(instr->operands.size() <= 128);
2114    assert(parallelcopy.empty());
2115 
2116    RegisterFile tmp_file(register_file);
2117 
2118    BITSET_DECLARE(mask, 128) = {0};
2119 
2120    for (unsigned i = 0; i < instr->operands.size(); i++) {
2121       Operand& op = instr->operands[i];
2122 
2123       if (!op.isPrecolored())
2124          continue;
2125 
2126       assert(op.isTemp());
2127       PhysReg src = ctx.assignments[op.tempId()].reg;
2128       adjust_max_used_regs(ctx, op.regClass(), op.physReg());
2129 
2130       if (op.physReg() == src) {
2131          tmp_file.block(op.physReg(), op.regClass());
2132          continue;
2133       }
2134 
2135       /* An instruction can have at most one operand precolored to the same register. */
2136       assert(std::none_of(parallelcopy.begin(), parallelcopy.end(),
2137                           [&](auto copy) { return copy.second.physReg() == op.physReg(); }));
2138 
2139       /* clear from register_file so fixed operands are not collected by collect_vars() */
2140       tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
2141 
2142       BITSET_SET(mask, i);
2143 
2144       Operand pc_op(instr->operands[i].getTemp());
2145       pc_op.setFixed(src);
2146       Definition pc_def = Definition(op.physReg(), pc_op.regClass());
2147       parallelcopy.emplace_back(pc_op, pc_def);
2148    }
2149 
2150    if (BITSET_IS_EMPTY(mask))
2151       return;
2152 
2153    unsigned i;
2154    std::vector<unsigned> blocking_vars;
2155    BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
2156       Operand& op = instr->operands[i];
2157       PhysRegInterval target{op.physReg(), op.size()};
2158       std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
2159       blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
2160 
2161       /* prevent get_regs_for_copies() from using these registers */
2162       tmp_file.block(op.physReg(), op.regClass());
2163    }
2164 
2165    get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
2166    update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
2167 }
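/* Sketch (registers assumed): if operand %a is precolored to v[0] but
 * currently lives in v[5], the copy (%a: v[5]) -> (v[0]) is queued, whatever
 * currently occupies v[0] is collected and reassigned via
 * get_regs_for_copies(), and v[0] is blocked so the evicted variables cannot
 * be moved back into it.
 */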
2168 
2169 void
2170 get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
2171                     std::vector<std::pair<Operand, Definition>>& parallelcopy,
2172                     aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
2173 {
2174    /* clear the operand in case it's only a stride mismatch */
2175    PhysReg src = ctx.assignments[operand.tempId()].reg;
2176    register_file.clear(src, operand.regClass());
2177    PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
2178 
2179    Operand pc_op = operand;
2180    pc_op.setFixed(src);
2181    Definition pc_def = Definition(dst, pc_op.regClass());
2182    parallelcopy.emplace_back(pc_op, pc_def);
2183    update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
2184    register_file.fill(Definition(operand.getTemp(), dst));
2185 }
2186 
2187 PhysReg
2188 get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
2189             std::vector<aco_ptr<Instruction>>& instructions, Block& block,
2190             aco_ptr<Instruction>& phi, Temp tmp)
2191 {
2192    std::vector<std::pair<Operand, Definition>> parallelcopy;
2193    PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
2194    update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
2195 
2196    /* process parallelcopy */
2197    for (std::pair<Operand, Definition> pc : parallelcopy) {
2198       /* see if it's a copy from a different phi */
2199       // TODO: prefer moving some previous phis over live-ins
2200       // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
2201       // problem in practice since they can only be fixed to exec)
2202       Instruction* prev_phi = NULL;
2203       for (auto phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
2204          if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
2205             prev_phi = phi_it->get();
2206       }
2207       if (prev_phi) {
2208          /* if so, just update that phi's register */
2209          prev_phi->definitions[0].setFixed(pc.second.physReg());
2210          register_file.fill(prev_phi->definitions[0]);
2211          ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
2212                                                                pc.second.regClass()};
2213          continue;
2214       }
2215 
2216       /* rename */
2217       auto orig_it = ctx.orig_names.find(pc.first.tempId());
2218       Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
2219       add_rename(ctx, orig, pc.second.getTemp());
2220 
2221       /* otherwise, this is a live-in and we need to create a new phi
2222        * to move it in this block's predecessors */
2223       aco_opcode opcode =
2224          pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2225       Block::edge_vec& preds =
2226          pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
2227       aco_ptr<Instruction> new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
2228       new_phi->definitions[0] = pc.second;
2229       for (unsigned i = 0; i < preds.size(); i++)
2230          new_phi->operands[i] = Operand(pc.first);
2231       instructions.emplace_back(std::move(new_phi));
2232 
2233       /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
2234        * a loop header.
2235        */
2236       live_in.erase(orig.id());
2237    }
2238 
2239    return reg;
2240 }
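/* Example of the live-in case above (names assumed): if allocating this phi
 * forced live-in %5 from s[2] to s[4], no copy is inserted into this block;
 * instead a new phi
 *
 *    %31: s[4] = p_linear_phi %5: s[2], %5: s[2]
 *
 * is appended so that each predecessor moves the value into its new register.
 */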
2241 
2242 void
2243 get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
2244                   std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
2245 {
2246    /* move all phis to instructions */
2247    bool has_linear_phis = false;
2248    for (aco_ptr<Instruction>& phi : block.instructions) {
2249       if (!is_phi(phi))
2250          break;
2251       if (!phi->definitions[0].isKill()) {
2252          has_linear_phis |= phi->opcode == aco_opcode::p_linear_phi;
2253          instructions.emplace_back(std::move(phi));
2254       }
2255    }
2256 
2257    /* assign phis with all-matching registers to that register */
2258    for (aco_ptr<Instruction>& phi : instructions) {
2259       Definition& definition = phi->definitions[0];
2260       if (definition.isFixed())
2261          continue;
2262 
2263       if (!phi->operands[0].isTemp())
2264          continue;
2265 
2266       PhysReg reg = phi->operands[0].physReg();
2267       auto OpsSame = [=](const Operand& op) -> bool
2268       { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
2269       bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
2270       if (!all_same)
2271          continue;
2272 
2273       if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg, -1))
2274          continue;
2275 
2276       definition.setFixed(reg);
2277       register_file.fill(definition);
2278       ctx.assignments[definition.tempId()].set(definition);
2279    }
2280 
2281    /* try to find a register that is used by at least one operand */
2282    for (aco_ptr<Instruction>& phi : instructions) {
2283       Definition& definition = phi->definitions[0];
2284       if (definition.isFixed())
2285          continue;
2286 
2287       /* use affinity if available */
2288       if (ctx.assignments[definition.tempId()].affinity &&
2289           ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
2290          assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
2291          assert(affinity.rc == definition.regClass());
2292          if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg, -1)) {
2293             definition.setFixed(affinity.reg);
2294             register_file.fill(definition);
2295             ctx.assignments[definition.tempId()].set(definition);
2296             continue;
2297          }
2298       }
2299 
2300       /* by going backwards, we aim to avoid copies in else-blocks */
2301       for (int i = phi->operands.size() - 1; i >= 0; i--) {
2302          const Operand& op = phi->operands[i];
2303          if (!op.isTemp() || !op.isFixed())
2304             continue;
2305 
2306          PhysReg reg = op.physReg();
2307          if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg, -1)) {
2308             definition.setFixed(reg);
2309             register_file.fill(definition);
2310             ctx.assignments[definition.tempId()].set(definition);
2311             break;
2312          }
2313       }
2314    }
2315 
2316    /* find registers for phis where the register was blocked or no operand was assigned */
2317 
2318    /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
2319    for (unsigned i = 0; i < instructions.size(); i++) {
2320       aco_ptr<Instruction>& phi = instructions[i];
2321       Definition& definition = phi->definitions[0];
2322       if (definition.isFixed())
2323          continue;
2324 
2325       definition.setFixed(
2326          get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));
2327 
2328       register_file.fill(definition);
2329       ctx.assignments[definition.tempId()].set(definition);
2330    }
2331 
2332    /* Provide a scratch register in case we need to preserve SCC */
2333    if (has_linear_phis || block.kind & block_kind_loop_header) {
2334       PhysReg scratch_reg = scc;
2335       if (register_file[scc]) {
2336          scratch_reg = get_reg_phi(ctx, live_in, register_file, instructions, block, ctx.phi_dummy,
2337                                    Temp(0, s1));
2338          if (block.kind & block_kind_loop_header)
2339             ctx.loop_header.back().second = scratch_reg;
2340       }
2341 
2342       for (aco_ptr<Instruction>& phi : instructions) {
2343          phi->pseudo().scratch_sgpr = scratch_reg;
2344          phi->pseudo().needs_scratch_reg = true;
2345       }
2346    }
2347 }
2348 
2349 inline Temp
2350 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
2351 {
2352    /* This variable didn't get renamed yet. */
2353    if (!ctx.assignments[val.id()].renamed)
2354       return val;
2355 
2356    auto it = ctx.renames[block_idx].find(val.id());
2357    if (it == ctx.renames[block_idx].end())
2358       return val;
2359    else
2360       return it->second;
2361 }
2362 
2363 Temp
2364 handle_live_in(ra_ctx& ctx, Temp val, Block* block)
2365 {
2366    /* This variable didn't get renamed yet. */
2367    if (!ctx.assignments[val.id()].renamed)
2368       return val;
2369 
2370    Block::edge_vec& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
2371    if (preds.size() == 0)
2372       return val;
2373 
2374    if (preds.size() == 1) {
2375       /* if the block has only one predecessor, just look there for the name */
2376       return read_variable(ctx, val, preds[0]);
2377    }
2378 
2379    /* there are multiple predecessors and the block is sealed */
2380    Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));
2381 
2382    /* get the rename from each predecessor and check if they are the same */
2383    Temp new_val;
2384    bool needs_phi = false;
2385    for (unsigned i = 0; i < preds.size(); i++) {
2386       ops[i] = read_variable(ctx, val, preds[i]);
2387       if (i == 0)
2388          new_val = ops[i];
2389       else
2390          needs_phi |= !(new_val == ops[i]);
2391    }
2392 
2393    if (needs_phi) {
2394       assert(!val.regClass().is_linear_vgpr());
2395 
2396       /* the variable has been renamed differently in the predecessors: we need to insert a phi */
2397       aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2398       aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
2399       new_val = ctx.program->allocateTmp(val.regClass());
2400       phi->definitions[0] = Definition(new_val);
2401       ctx.assignments.emplace_back();
2402       assert(ctx.assignments.size() == ctx.program->peekAllocationId());
2403       for (unsigned i = 0; i < preds.size(); i++) {
2404          /* update the operands so that it uses the new affinity */
2405          phi->operands[i] = Operand(ops[i]);
2406          assert(ctx.assignments[ops[i].id()].assigned);
2407          assert(ops[i].regClass() == new_val.regClass());
2408          phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
2409       }
2410       block->instructions.insert(block->instructions.begin(), std::move(phi));
2411    }
2412 
2413    return new_val;
2414 }
2415 
2416 void
2417 handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
2418                  uint32_t loop_exit_idx, PhysReg scratch_reg)
2419 {
2420    Block& loop_header = ctx.program->blocks[loop_header_idx];
2421    aco::unordered_map<uint32_t, Temp> renames(ctx.memory);
2422 
2423    /* create phis for variables renamed during the loop */
2424    for (unsigned t : live_in) {
2425       if (!ctx.assignments[t].renamed)
2426          continue;
2427 
2428       Temp val = Temp(t, ctx.program->temp_rc[t]);
2429       Temp prev = read_variable(ctx, val, loop_header_idx - 1);
2430       Temp renamed = handle_live_in(ctx, val, &loop_header);
2431       if (renamed == prev)
2432          continue;
2433 
2434       /* insert additional renames at block end, but don't overwrite */
2435       renames[prev.id()] = renamed;
2436       ctx.orig_names[renamed.id()] = val;
2437       for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2438          auto it = ctx.renames[idx].emplace(val.id(), renamed);
2439          /* if insertion is unsuccessful, update if necessary */
2440          if (!it.second && it.first->second == prev)
2441             it.first->second = renamed;
2442       }
2443 
2444       /* update loop-carried values of the phi created by handle_live_in() */
2445       for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
2446          Operand& op = loop_header.instructions[0]->operands[i];
2447          if (op.getTemp() == prev)
2448             op.setTemp(renamed);
2449       }
2450 
2451       /* use the assignment from the loop preheader and fix def reg */
2452       assignment& var = ctx.assignments[prev.id()];
2453       ctx.assignments[renamed.id()] = var;
2454       loop_header.instructions[0]->definitions[0].setFixed(var.reg);
2455 
2456       /* Set scratch register */
2457       loop_header.instructions[0]->pseudo().scratch_sgpr = scratch_reg;
2458       loop_header.instructions[0]->pseudo().needs_scratch_reg = true;
2459    }
2460 
2461    /* rename loop carried phi operands */
2462    for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
2463       aco_ptr<Instruction>& phi = loop_header.instructions[i];
2464       if (!is_phi(phi))
2465          break;
2466       const Block::edge_vec& preds =
2467          phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
2468       for (unsigned j = 1; j < phi->operands.size(); j++) {
2469          Operand& op = phi->operands[j];
2470          if (!op.isTemp())
2471             continue;
2472 
2473          /* Find the original name, since this operand might not use the original name if the phi
2474           * was created after init_reg_file().
2475           */
2476          auto it = ctx.orig_names.find(op.tempId());
2477          Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();
2478 
2479          op.setTemp(read_variable(ctx, orig, preds[j]));
2480          op.setFixed(ctx.assignments[op.tempId()].reg);
2481       }
2482    }
2483 
2484    /* return early if no new phi was created */
2485    if (renames.empty())
2486       return;
2487 
2488    /* propagate new renames through loop */
2489    for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2490       Block& current = ctx.program->blocks[idx];
2491       /* rename all uses in this block */
2492       for (aco_ptr<Instruction>& instr : current.instructions) {
2493          /* phis are renamed after RA */
2494          if (idx == loop_header_idx && is_phi(instr))
2495             continue;
2496 
2497          for (Operand& op : instr->operands) {
2498             if (!op.isTemp())
2499                continue;
2500 
2501             auto rename = renames.find(op.tempId());
2502             if (rename != renames.end()) {
2503                assert(rename->second.id());
2504                op.setTemp(rename->second);
2505             }
2506          }
2507       }
2508    }
2509 }
2510 
2511 /**
2512  * Initializes the register file at the beginning of a block (before any
2513  * existing phis): all live-in variables are entered into the RegisterFile,
2514  * reg-to-reg moves (renames) from previous blocks are taken into account, and
2515  * the SSA is repaired by inserting the corresponding phi-nodes.
2516  */
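/* Example of the SSA repair (temp/block numbers assumed): if %3 was moved by
 * a live-range split and therefore renamed to %27 in only one of two
 * predecessors, handle_live_in() sees { pred0: %3, pred1: %27 } and inserts
 *
 *    %28 = p_phi %3, %27
 *
 * at the top of this block; all uses of %3 in this block then read %28.
 */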
2518 RegisterFile
2519 init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
2520 {
2521    if (block.kind & block_kind_loop_exit) {
2522       uint32_t header = ctx.loop_header.back().first;
2523       PhysReg scratch_reg = ctx.loop_header.back().second;
2524       ctx.loop_header.pop_back();
2525       handle_loop_phis(ctx, live_out_per_block[header], header, block.index, scratch_reg);
2526    }
2527 
2528    RegisterFile register_file;
2529    const IDSet& live_in = live_out_per_block[block.index];
2530    assert(block.index != 0 || live_in.empty());
2531 
2532    if (block.kind & block_kind_loop_header) {
2533       ctx.loop_header.emplace_back(block.index, PhysReg{scc});
2534       /* rename the phis' incoming values right away */
2535       for (aco_ptr<Instruction>& instr : block.instructions) {
2536          if (!is_phi(instr))
2537             break;
2538          Operand& operand = instr->operands[0];
2539          if (operand.isTemp()) {
2540             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
2541             operand.setFixed(ctx.assignments[operand.tempId()].reg);
2542          }
2543       }
2544       for (unsigned t : live_in) {
2545          Temp val = Temp(t, ctx.program->temp_rc[t]);
2546          Temp renamed = read_variable(ctx, val, block.index - 1);
2547          if (renamed != val)
2548             add_rename(ctx, val, renamed);
2549          assignment& var = ctx.assignments[renamed.id()];
2550          assert(var.assigned);
2551          register_file.fill(Definition(renamed, var.reg));
2552       }
2553    } else {
2554       /* rename phi operands */
2555       for (aco_ptr<Instruction>& instr : block.instructions) {
2556          if (!is_phi(instr))
2557             break;
2558          const Block::edge_vec& preds =
2559             instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
2560 
2561          for (unsigned i = 0; i < instr->operands.size(); i++) {
2562             Operand& operand = instr->operands[i];
2563             if (!operand.isTemp())
2564                continue;
2565             operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2566             operand.setFixed(ctx.assignments[operand.tempId()].reg);
2567          }
2568       }
2569       for (unsigned t : live_in) {
2570          Temp val = Temp(t, ctx.program->temp_rc[t]);
2571          Temp renamed = handle_live_in(ctx, val, &block);
2572          assignment& var = ctx.assignments[renamed.id()];
2573          /* due to live-range splits, the live-in might now be a phi */
2574          if (var.assigned) {
2575             register_file.fill(Definition(renamed, var.reg));
2576          }
2577          if (renamed != val) {
2578             add_rename(ctx, val, renamed);
2579          }
2580       }
2581    }
2582 
2583    return register_file;
2584 }
2585 
2586 bool
2587 vop3_can_use_vop2acc(ra_ctx& ctx, Instruction* instr)
2588 {
2589    if (!instr->isVOP3() && !instr->isVOP3P())
2590       return false;
2591 
2592    switch (instr->opcode) {
2593    case aco_opcode::v_mad_f32:
2594    case aco_opcode::v_mad_f16:
2595    case aco_opcode::v_mad_legacy_f16: break;
2596    case aco_opcode::v_fma_f32:
2597    case aco_opcode::v_pk_fma_f16:
2598    case aco_opcode::v_fma_f16:
2599    case aco_opcode::v_dot4_i32_i8:
2600       if (ctx.program->gfx_level < GFX10)
2601          return false;
2602       break;
2603    case aco_opcode::v_mad_legacy_f32:
2604       if (!ctx.program->dev.has_mac_legacy32)
2605          return false;
2606       break;
2607    case aco_opcode::v_fma_legacy_f32:
2608       if (!ctx.program->dev.has_fmac_legacy32)
2609          return false;
2610       break;
2611    default: return false;
2612    }
2613 
2614    if (!instr->operands[2].isOfType(RegType::vgpr) || !instr->operands[2].isKillBeforeDef() ||
2615        (!instr->operands[0].isOfType(RegType::vgpr) && !instr->operands[1].isOfType(RegType::vgpr)))
2616       return false;
2617 
2618    if (instr->isVOP3P()) {
2619       for (unsigned i = 0; i < 3; i++) {
2620          if (instr->operands[i].isLiteral())
2621             continue;
2622 
2623          if (instr->valu().opsel_lo[i])
2624             return false;
2625 
2626          /* v_pk_fmac_f16 inline constants are replicated to hi bits starting with gfx11. */
2627          if (instr->valu().opsel_hi[i] ==
2628              (instr->operands[i].isConstant() && ctx.program->gfx_level >= GFX11))
2629             return false;
2630       }
2631    } else {
2632       if (instr->valu().opsel & (ctx.program->gfx_level < GFX11 ? 0xf : ~0x3))
2633          return false;
2634       for (unsigned i = 0; i < 2; i++) {
2635          if (!instr->operands[i].isOfType(RegType::vgpr) && instr->valu().opsel[i])
2636             return false;
2637       }
2638    }
2639 
2640    unsigned im_mask = instr->isDPP16() && instr->isVOP3() ? 0x3 : 0;
2641    if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) ||
2642        (instr->valu().neg & ~im_mask))
2643       return false;
2644 
2645    return true;
2646 }
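/* Example (illustrative IR, not produced by this function itself): with the
 * third source killed and in a VGPR, the definition can later reuse that
 * register by switching to the accumulator form, e.g.
 *
 *    v1: %d = v_mad_f32 %a, %b, %c      (%c in v0, killBeforeDef)
 * becomes
 *    v1: %d: v0 = v_mac_f32 %a, %b, %c: v0
 */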
2647 
2648 bool
2649 sop2_can_use_sopk(ra_ctx& ctx, Instruction* instr)
2650 {
2651    if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_add_u32 &&
2652        instr->opcode != aco_opcode::s_mul_i32 && instr->opcode != aco_opcode::s_cselect_b32)
2653       return false;
2654 
2655    if (instr->opcode == aco_opcode::s_add_u32 && !instr->definitions[1].isKill())
2656       return false;
2657 
2658    uint32_t literal_idx = 0;
2659 
2660    if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
2661       literal_idx = 1;
2662 
2663    if (!instr->operands[!literal_idx].isTemp() || !instr->operands[!literal_idx].isKillBeforeDef())
2664       return false;
2665 
2666    if (!instr->operands[literal_idx].isLiteral())
2667       return false;
2668 
2669    const uint32_t i16_mask = 0xffff8000u;
2670    uint32_t value = instr->operands[literal_idx].constantValue();
2671    if ((value & i16_mask) && (value & i16_mask) != i16_mask)
2672       return false;
2673 
2674    return true;
2675 }
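/* Worked example of the literal check (values assumed): SOPK immediates are
 * signed 16-bit, so bits [31:15] of the literal must be all zero or all one:
 *
 *    0x00001234 -> ok        (fits as  4660)
 *    0xffff8000 -> ok        (fits as -32768)
 *    0x00008000 -> rejected  (bit 15 set, upper bits clear)
 *
 * e.g. s_add_i32 %d, %a, 0x1234 can then be encoded as s_addk_i32.
 */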
2676 
2677 void
2678 create_phi_vector_affinities(ra_ctx& ctx, aco_ptr<Instruction>& instr,
2679                              std::map<Operand*, std::vector<vector_info>>& vector_phis)
2680 {
2681    auto it = ctx.vectors.find(instr->definitions[0].tempId());
2682    if (it == ctx.vectors.end())
2683       return;
2684    vector_info& dest_vector = it->second;
2685 
2686    auto pair = vector_phis.try_emplace(dest_vector.parts, instr->operands.size(), dest_vector);
2687    std::vector<vector_info>& src_vectors = pair.first->second;
2688    if (pair.second) {
2689       RegType type = instr->definitions[0].regClass().type();
2690 
2691       for (vector_info& src_vector : src_vectors) {
2692          src_vector.parts =
2693             (Operand*)ctx.memory.allocate(sizeof(Operand) * src_vector.num_parts, alignof(Operand));
2694          for (unsigned j = 0; j < src_vector.num_parts; j++)
2695             src_vector.parts[j] = Operand(RegClass::get(type, dest_vector.parts[j].bytes()));
2696       }
2697    }
2698 
2699    unsigned index = 0;
2700    for (; index < dest_vector.num_parts; index++) {
2701       if (dest_vector.parts[index].isTemp() &&
2702           dest_vector.parts[index].tempId() == instr->definitions[0].tempId())
2703          break;
2704    }
2705    assert(index != dest_vector.num_parts);
2706 
2707    for (int i = instr->operands.size() - 1; i >= 0; i--) {
2708       const Operand& op = instr->operands[i];
2709       if (!op.isTemp() || op.regClass() != instr->definitions[0].regClass())
2710          continue;
2711 
2712       src_vectors[i].parts[index] = op;
2713       ctx.vectors[op.tempId()] = src_vectors[i];
2714    }
2715 }
2716 
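/* Added annotation: walk all blocks in reverse to collect allocation hints:
 * vector operands that should stay contiguous, temporaries that want VCC or m0,
 * and chains of phi-related temporaries (merge sets), which are finally
 * recorded as assignment affinities so related values land in the same
 * register. */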
2717 void
2718 get_affinities(ra_ctx& ctx)
2719 {
2720    std::vector<std::vector<Temp>> phi_resources;
2721    aco::unordered_map<uint32_t, uint32_t> temp_to_phi_resources(ctx.memory);
2722 
2723    for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
2724         block_rit++) {
2725       Block& block = *block_rit;
2726 
2727       std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
2728       for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
2729          aco_ptr<Instruction>& instr = *rit;
2730          if (is_phi(instr))
2731             break;
2732 
2733          /* add vector affinities */
2734          if (instr->opcode == aco_opcode::p_create_vector) {
2735             for (const Operand& op : instr->operands) {
2736                if (op.isTemp() && op.isFirstKill() &&
2737                    op.getTemp().type() == instr->definitions[0].getTemp().type())
2738                   ctx.vectors[op.tempId()] = vector_info(instr.get());
2739             }
2740          } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
2741                     !instr->mimg().strict_wqm && ctx.program->gfx_level < GFX12) {
2742             for (unsigned i = 3; i < instr->operands.size(); i++)
2743                ctx.vectors[instr->operands[i].tempId()] = vector_info(instr.get(), 3, true);
2744          } else if (instr->opcode == aco_opcode::p_split_vector &&
2745                     instr->operands[0].isFirstKillBeforeDef()) {
2746             ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
2747          } else if (instr->isVOPC() && !instr->isVOP3()) {
2748             if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
2749                ctx.assignments[instr->definitions[0].tempId()].vcc = true;
2750          } else if (instr->isVOP2() && !instr->isVOP3()) {
2751             if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
2752                 instr->operands[2].regClass().type() == RegType::sgpr)
2753                ctx.assignments[instr->operands[2].tempId()].vcc = true;
2754             if (instr->definitions.size() == 2)
2755                ctx.assignments[instr->definitions[1].tempId()].vcc = true;
2756          } else if (instr->opcode == aco_opcode::s_and_b32 ||
2757                     instr->opcode == aco_opcode::s_and_b64) {
2758             /* If SCC is used by a branch, we might be able to use
2759              * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
2760              */
2761             if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
2762                 instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
2763                ctx.assignments[instr->operands[0].tempId()].vcc = true;
2764          } else if (instr->opcode == aco_opcode::s_sendmsg) {
2765             ctx.assignments[instr->operands[0].tempId()].m0 = true;
2766          }
2767 
2768          int op_fixed_to_def0 = get_op_fixed_to_def(instr.get());
2769          for (unsigned i = 0; i < instr->definitions.size(); i++) {
2770             const Definition& def = instr->definitions[i];
2771             if (!def.isTemp())
2772                continue;
2773             /* mark last-seen phi operand */
2774             auto it = temp_to_phi_resources.find(def.tempId());
2775             if (it != temp_to_phi_resources.end() &&
2776                 def.regClass() == phi_resources[it->second][0].regClass()) {
2777                phi_resources[it->second][0] = def.getTemp();
2778                /* try to coalesce phi affinities with parallelcopies */
2779                Operand op;
2780                if (instr->opcode == aco_opcode::p_parallelcopy) {
2781                   op = instr->operands[i];
2782                } else if (i == 0 && op_fixed_to_def0 != -1) {
2783                   op = instr->operands[op_fixed_to_def0];
2784                } else if (vop3_can_use_vop2acc(ctx, instr.get())) {
2785                   op = instr->operands[2];
2786                } else if (i == 0 && sop2_can_use_sopk(ctx, instr.get())) {
2787                   op = instr->operands[instr->operands[0].isLiteral()];
2788                } else {
2789                   continue;
2790                }
2791 
2792                if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
2793                   phi_resources[it->second].emplace_back(op.getTemp());
2794                   temp_to_phi_resources[op.tempId()] = it->second;
2795                }
2796             }
2797          }
2798       }
2799 
2800       /* collect phi affinities */
2801       std::map<Operand*, std::vector<vector_info>> vector_phis;
2802       for (; rit != block.instructions.rend(); ++rit) {
2803          aco_ptr<Instruction>& instr = *rit;
2804          assert(is_phi(instr));
2805 
2806          if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
2807             continue;
2808 
2809          assert(instr->definitions[0].isTemp());
2810          auto it = temp_to_phi_resources.find(instr->definitions[0].tempId());
2811          unsigned index = phi_resources.size();
2812          std::vector<Temp>* affinity_related;
2813          if (it != temp_to_phi_resources.end()) {
2814             index = it->second;
2815             phi_resources[index][0] = instr->definitions[0].getTemp();
2816             affinity_related = &phi_resources[index];
2817          } else {
2818             phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
2819             affinity_related = &phi_resources.back();
2820          }
2821 
2822          for (const Operand& op : instr->operands) {
2823             if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
2824                affinity_related->emplace_back(op.getTemp());
2825                if (block.kind & block_kind_loop_header)
2826                   continue;
2827                temp_to_phi_resources[op.tempId()] = index;
2828             }
2829          }
2830 
2831          create_phi_vector_affinities(ctx, instr, vector_phis);
2832       }
2833 
2834       /* visit the loop header phis first in order to create nested affinities */
2835       if (block.kind & block_kind_loop_exit) {
2836          /* find loop header */
2837          auto header_rit = block_rit;
2838          while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
2839             header_rit++;
2840 
2841          for (aco_ptr<Instruction>& phi : header_rit->instructions) {
2842             if (!is_phi(phi))
2843                break;
2844             if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
2845                continue;
2846 
2847             /* create an (empty) merge-set for the phi-related variables */
2848             auto it = temp_to_phi_resources.find(phi->definitions[0].tempId());
2849             unsigned index = phi_resources.size();
2850             if (it == temp_to_phi_resources.end()) {
2851                temp_to_phi_resources[phi->definitions[0].tempId()] = index;
2852                phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
2853             } else {
2854                index = it->second;
2855             }
2856             for (unsigned i = 1; i < phi->operands.size(); i++) {
2857                const Operand& op = phi->operands[i];
2858                if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
2859                   temp_to_phi_resources[op.tempId()] = index;
2860                }
2861             }
2862          }
2863       }
2864    }
2865    /* create affinities */
2866    for (std::vector<Temp>& vec : phi_resources) {
2867       for (unsigned i = 1; i < vec.size(); i++)
2868          if (vec[i].id() != vec[0].id())
2869             ctx.assignments[vec[i].id()].affinity = vec[0].id();
2870    }
2871 }
2872 
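/* Added annotation: after operand registers are known, try to shrink a VOP3
 * mad/fma-style instruction to its VOP2 accumulator form (v_mac/v_fmac). The
 * accumulator operand must be a killed, dword-aligned VGPR whose register the
 * definition can reuse. */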
2873 void
2874 optimize_encoding_vop2(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2875 {
2876    if (!vop3_can_use_vop2acc(ctx, instr.get()))
2877       return;
2878 
2879    for (unsigned i = ctx.program->gfx_level < GFX11 ? 0 : 2; i < 3; i++) {
2880       if (instr->operands[i].physReg().byte())
2881          return;
2882    }
2883 
2884    unsigned def_id = instr->definitions[0].tempId();
2885    if (ctx.assignments[def_id].affinity) {
2886       assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2887       if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
2888           !register_file.test(affinity.reg, instr->operands[2].bytes()))
2889          return;
2890    }
2891 
2892    if (!instr->operands[1].isOfType(RegType::vgpr))
2893       instr->valu().swapOperands(0, 1);
2894 
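   /* For VOP3P with a literal, opsel_lo/opsel_hi select 16-bit halves of the
    * literal; VOP2 has no opsel, so pre-apply the selection by repacking the
    * chosen halves into a new 32-bit literal. */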
2895    if (instr->isVOP3P() && instr->operands[0].isLiteral()) {
2896       unsigned literal = instr->operands[0].constantValue();
2897       unsigned lo = (literal >> (instr->valu().opsel_lo[0] * 16)) & 0xffff;
2898       unsigned hi = (literal >> (instr->valu().opsel_hi[0] * 16)) & 0xffff;
2899       instr->operands[0] = Operand::literal32(lo | (hi << 16));
2900    }
2901 
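   /* Strip the VOP3/VOP3P bits from the encoding and switch to the VOP2 opcode
    * of the same operation (e.g. v_fma_f32 -> v_fmac_f32). */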
2902    instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) |
2903                             (unsigned)Format::VOP2);
2904    instr->valu().opsel_lo = 0;
2905    instr->valu().opsel_hi = 0;
2906    switch (instr->opcode) {
2907    case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
2908    case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
2909    case aco_opcode::v_mad_f16:
2910    case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
2911    case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
2912    case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
2913    case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
2914    case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
2915    case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
2916    default: break;
2917    }
2918 }
2919 
2920 void
2921 optimize_encoding_sopk(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2922 {
2923    /* try to optimize sop2 with literal source to sopk */
2924    if (!sop2_can_use_sopk(ctx, instr.get()))
2925       return;
2926    unsigned literal_idx = instr->operands[1].isLiteral();
2927 
2928    PhysReg op_reg = instr->operands[!literal_idx].physReg();
2929    if (!is_sgpr_writable_without_side_effects(ctx.program->gfx_level, op_reg))
2930       return;
2931 
2932    unsigned def_id = instr->definitions[0].tempId();
2933    if (ctx.assignments[def_id].affinity) {
2934       assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2935       if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
2936           !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
2937          return;
2938    }
2939 
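   /* Rewrite to SOPK: bake the literal into the 16-bit immediate and shuffle
    * the operands so the surviving source ends up in operands[0]; SOPK
    * instructions read and write sdst, so that operand's register becomes the
    * destination. */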
2940    instr->format = Format::SOPK;
2941    instr->salu().imm = instr->operands[literal_idx].constantValue() & 0xffff;
2942    if (literal_idx == 0)
2943       std::swap(instr->operands[0], instr->operands[1]);
2944    if (instr->operands.size() > 2)
2945       std::swap(instr->operands[1], instr->operands[2]);
2946    instr->operands.pop_back();
2947 
2948    switch (instr->opcode) {
2949    case aco_opcode::s_add_u32:
2950    case aco_opcode::s_add_i32: instr->opcode = aco_opcode::s_addk_i32; break;
2951    case aco_opcode::s_mul_i32: instr->opcode = aco_opcode::s_mulk_i32; break;
2952    case aco_opcode::s_cselect_b32: instr->opcode = aco_opcode::s_cmovk_i32; break;
2953    default: unreachable("illegal instruction");
2954    }
2955 }
2956 
2957 void
2958 optimize_encoding(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2959 {
2960    if (instr->isVALU())
2961       optimize_encoding_vop2(ctx, register_file, instr);
2962    if (instr->isSALU())
2963       optimize_encoding_sopk(ctx, register_file, instr);
2964 }
2965 
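/* Added annotation: materialize the pending (Operand, Definition) copies as a
 * single p_parallelcopy and record the renames. SGPR copies that form swaps and
 * linear-VGPR copies may need a scratch register; if SCC is live, a temporary
 * register file (definitions cleared, killed operands re-blocked) is built so
 * handle_pseudo() can pick a safe scratch SGPR instead. */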
2966 void
2967 emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
2968                             aco_ptr<Instruction>& instr,
2969                             std::vector<aco_ptr<Instruction>>& instructions, bool temp_in_scc,
2970                             RegisterFile& register_file)
2971 {
2972    if (parallelcopy.empty())
2973       return;
2974 
2975    aco_ptr<Instruction> pc;
2976    pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(),
2977                                parallelcopy.size()));
2978    bool linear_vgpr = false;
2979    bool may_swap_sgprs = false;
2980    std::bitset<256> sgpr_operands;
2981    for (unsigned i = 0; i < parallelcopy.size(); i++) {
2982       linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
2983 
2984       if (!may_swap_sgprs && parallelcopy[i].first.isTemp() &&
2985           parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
2986          unsigned op_reg = parallelcopy[i].first.physReg().reg();
2987          unsigned def_reg = parallelcopy[i].second.physReg().reg();
2988          for (unsigned j = 0; j < parallelcopy[i].first.size(); j++) {
2989             sgpr_operands.set(op_reg + j);
2990             if (sgpr_operands.test(def_reg + j))
2991                may_swap_sgprs = true;
2992          }
2993       }
2994 
2995       pc->operands[i] = parallelcopy[i].first;
2996       pc->definitions[i] = parallelcopy[i].second;
2997       assert(pc->operands[i].size() == pc->definitions[i].size());
2998 
2999       /* It might happen that the operand was already renamed; we have to restore
3000        * the original name. */
3001       auto it = ctx.orig_names.find(pc->operands[i].tempId());
3002       Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
3003       add_rename(ctx, orig, pc->definitions[i].getTemp());
3004    }
3005 
3006    if (temp_in_scc && (may_swap_sgprs || linear_vgpr)) {
3007       /* disable definitions and re-enable operands */
3008       RegisterFile tmp_file(register_file);
3009       for (const Definition& def : instr->definitions) {
3010          if (def.isTemp() && !def.isKill())
3011             tmp_file.clear(def);
3012       }
3013       for (const Operand& op : instr->operands) {
3014          if (op.isTemp() && op.isFirstKill())
3015             tmp_file.block(op.physReg(), op.regClass());
3016       }
3017 
3018       handle_pseudo(ctx, tmp_file, pc.get());
3019    } else {
3020       pc->pseudo().needs_scratch_reg = may_swap_sgprs || linear_vgpr;
3021       pc->pseudo().scratch_sgpr = scc;
3022    }
3023 
3024    instructions.emplace_back(std::move(pc));
3025 
3026    parallelcopy.clear();
3027 }
3028 
3029 void
3030 emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
3031                    aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& instructions,
3032                    bool temp_in_scc, RegisterFile& register_file)
3033 {
3034    if (parallelcopy.empty())
3035       return;
3036 
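   /* Split linear-VGPR copies from the rest, compacting the remaining entries
    * in place. */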
3037    std::vector<std::pair<Operand, Definition>> linear_vgpr;
3038    if (ctx.num_linear_vgprs) {
3039       unsigned next = 0;
3040       for (unsigned i = 0; i < parallelcopy.size(); i++) {
3041          if (parallelcopy[i].first.regClass().is_linear_vgpr()) {
3042             linear_vgpr.push_back(parallelcopy[i]);
3043             continue;
3044          }
3045 
3046          if (next != i)
3047             parallelcopy[next] = parallelcopy[i];
3048          next++;
3049       }
3050       parallelcopy.resize(next);
3051    }
3052 
3053    /* Because of how linear VGPRs are allocated, we should never have to move a linear VGPR into
3054     * the space of a normal one, so these copies can be done entirely before normal VGPR copies. */
3055    emit_parallel_copy_internal(ctx, linear_vgpr, instr, instructions, temp_in_scc,
3056                                register_file);
3057    emit_parallel_copy_internal(ctx, parallelcopy, instr, instructions, temp_in_scc,
3058                                register_file);
3059 }
3060 
3061 } /* end namespace */
3062 
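/* Added annotation: top-level driver. For each block, build the register file
 * from the live-in set, assign phi definitions, then process each instruction:
 * rename and fix operands, try cheaper encodings, allocate definitions, and
 * emit the parallelcopies needed to move blocking variables. */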
3063 void
3064 register_allocation(Program* program, ra_test_policy policy)
3065 {
3066    ra_ctx ctx(program, policy);
3067    get_affinities(ctx);
3068 
3069    for (Block& block : program->blocks) {
3070       ctx.block = &block;
3071 
3072       /* initialize register file */
3073       RegisterFile register_file = init_reg_file(ctx, program->live.live_in, block);
3074       ctx.war_hint.reset();
3075       ctx.rr_vgpr_it = {PhysReg{256}};
3076       ctx.rr_sgpr_it = {PhysReg{0}};
3077 
3078       std::vector<aco_ptr<Instruction>> instructions;
3079       instructions.reserve(block.instructions.size());
3080 
3081       /* This is a slight adjustment from the paper, since we already have phi
3082        * nodes: we consider them incomplete phis and only handle the definition. */
3083       get_regs_for_phis(ctx, block, register_file, instructions,
3084                         program->live.live_in[block.index]);
3085 
3086       /* Handle all other instructions of the block */
3087       auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
3088       auto instr_it = std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
3089       for (; instr_it != block.instructions.end(); ++instr_it) {
3090          aco_ptr<Instruction>& instr = *instr_it;
3091          std::vector<std::pair<Operand, Definition>> parallelcopy;
3092          assert(!is_phi(instr));
3093 
3094          /* handle operands */
3095          bool fixed = false;
3096          for (unsigned i = 0; i < instr->operands.size(); ++i) {
3097             auto& operand = instr->operands[i];
3098             if (!operand.isTemp())
3099                continue;
3100 
3101             /* rename operands */
3102             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
3103             assert(ctx.assignments[operand.tempId()].assigned);
3104 
3105             fixed |=
3106                operand.isPrecolored() && ctx.assignments[operand.tempId()].reg != operand.physReg();
3107          }
3108 
3109          bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
3110                              instr->opcode == aco_opcode::v_writelane_b32_e64;
3111          if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
3112              instr->operands[1].isTemp()) {
3113             /* v_writelane_b32 can take two sgprs but only if one is m0. */
3114             if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
3115                 ctx.assignments[instr->operands[1].tempId()].reg != m0) {
3116                instr->operands[0].setPrecolored(m0);
3117                fixed = true;
3118             }
3119          }
3120 
3121          if (fixed)
3122             handle_fixed_operands(ctx, register_file, parallelcopy, instr);
3123 
3124          for (unsigned i = 0; i < instr->operands.size(); ++i) {
3125             auto& operand = instr->operands[i];
3126             if (!operand.isTemp() || operand.isFixed())
3127                continue;
3128 
3129             PhysReg reg = ctx.assignments[operand.tempId()].reg;
3130             if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
3131                operand.setFixed(reg);
3132             else
3133                get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
3134 
3135             if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
3136                 (instr->isDS() && instr->ds().gds)) {
3137                for (unsigned j = 0; j < operand.size(); j++)
3138                   ctx.war_hint.set(operand.physReg().reg() + j);
3139             }
3140          }
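         /* Remember whether SCC is live here; parallelcopies emitted for this
          * instruction must not clobber it. */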
3141          bool temp_in_scc = register_file[scc];
3142 
3143          /* remove dead vars from register file */
3144          for (const Operand& op : instr->operands) {
3145             if (op.isTemp() && op.isFirstKillBeforeDef())
3146                register_file.clear(op);
3147          }
3148 
3149          optimize_encoding(ctx, register_file, instr);
3150 
3151          /* Handle definitions which must have the same register as an operand.
3152           * We expect that the definition has the same size as the operand, otherwise the new
3153           * location for the operand (if it's not killed) might intersect with the old one.
3154           * We can't read from the old location because it's corrupted, and we can't write the new
3155           * location because that's used by a live-through operand.
3156           */
3157          int op_fixed_to_def = get_op_fixed_to_def(instr.get());
3158          if (op_fixed_to_def != -1)
3159             instr->definitions[0].setPrecolored(instr->operands[op_fixed_to_def].physReg());
3160 
3161          /* handle fixed definitions first */
3162          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
3163             auto& definition = instr->definitions[i];
3164             if (!definition.isFixed())
3165                continue;
3166 
3167             adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
3168             /* check if the target register is blocked */
3169             if (register_file.test(definition.physReg(), definition.bytes())) {
3170                const PhysRegInterval def_regs{definition.physReg(), definition.size()};
3171 
3172                /* create parallelcopy pair to move blocking vars */
3173                std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);
3174 
3175                RegisterFile tmp_file(register_file);
3176                /* re-enable the killed operands, so that we don't move the blocking vars there */
3177                tmp_file.fill_killed_operands(instr.get());
3178 
3179                ASSERTED bool success = false;
3180                success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
3181                assert(success);
3182 
3183                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
3184             }
3185 
3186             if (!definition.isTemp())
3187                continue;
3188 
3189             ctx.assignments[definition.tempId()].set(definition);
3190             register_file.fill(definition);
3191          }
3192 
3193          /* handle all other definitions */
3194          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
3195             Definition* definition = &instr->definitions[i];
3196 
3197             if (definition->isFixed() || !definition->isTemp())
3198                continue;
3199 
3200             /* find free reg */
3201             if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
3202                /* Allocation of linear VGPRs is special. */
3203                definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy));
3204                update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
3205             } else if (instr->opcode == aco_opcode::p_split_vector) {
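               /* Prefer the corresponding slice of the source vector so the
                * split becomes a no-op copy. */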
3206                PhysReg reg = instr->operands[0].physReg();
3207                RegClass rc = definition->regClass();
3208                for (unsigned j = 0; j < i; j++)
3209                   reg.reg_b += instr->definitions[j].bytes();
3210                if (get_reg_specified(ctx, register_file, rc, instr, reg, -1)) {
3211                   definition->setFixed(reg);
3212                } else if (i == 0) {
3213                   RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
3214                   DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
3215                   std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info);
3216                   if (res && get_reg_specified(ctx, register_file, rc, instr, *res, -1))
3217                      definition->setFixed(*res);
3218                } else if (instr->definitions[i - 1].isFixed()) {
3219                   reg = instr->definitions[i - 1].physReg();
3220                   reg.reg_b += instr->definitions[i - 1].bytes();
3221                   if (get_reg_specified(ctx, register_file, rc, instr, reg, -1))
3222                      definition->setFixed(reg);
3223                }
3224             } else if (instr->opcode == aco_opcode::p_parallelcopy) {
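               /* Reuse the source register if it is free and the types match,
                * making this copy a no-op. */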
3225                PhysReg reg = instr->operands[i].physReg();
3226                if (instr->operands[i].isTemp() &&
3227                    instr->operands[i].getTemp().type() == definition->getTemp().type() &&
3228                    !register_file.test(reg, definition->bytes()))
3229                   definition->setFixed(reg);
3230             } else if (instr->opcode == aco_opcode::p_extract_vector) {
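               /* The element already lives at a fixed byte offset inside the
                * source vector; try to keep it there. */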
3231                PhysReg reg = instr->operands[0].physReg();
3232                reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
3233                if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg, -1))
3234                   definition->setFixed(reg);
3235             } else if (instr->opcode == aco_opcode::p_create_vector) {
3236                PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
3237                                                    parallelcopy, instr);
3238                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
3239                definition->setFixed(reg);
3240             } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma &&
3241                        instr->operands[2].isTemp() && instr->operands[2].isKill() &&
3242                        instr->operands[2].regClass() == definition->regClass()) {
3243                /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it.
3244                 * Here we set a policy of forcing them the same if operands[2] gets killed (and
3245                 * otherwise they don't overlap). This may not be optimal if RA would select a
3246                 * different location due to affinity, but that gets complicated very quickly. */
3247                definition->setFixed(instr->operands[2].physReg());
3248             }
3249 
3250             if (!definition->isFixed()) {
3251                Temp tmp = definition->getTemp();
3252                if (definition->regClass().is_subdword() && definition->bytes() < 4) {
3253                   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
3254                   definition->setFixed(reg);
3255                   if (reg.byte() || register_file.test(reg, 4)) {
3256                      bool allow_16bit_write = reg.byte() % 2 == 0 && !register_file.test(reg, 2);
3257                      add_subdword_definition(program, instr, reg, allow_16bit_write);
3258                      definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
3259                                                              the reference */
3260                   }
3261                } else {
3262                   definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
3263                }
3264                update_renames(ctx, register_file, parallelcopy, instr,
3265                               instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
3266                                                                            : (UpdateRenames)0);
3267             }
3268 
3269             assert(
3270                definition->isFixed() &&
3271                ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
3272                 (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
3273             ctx.assignments[definition->tempId()].set(*definition);
3274             register_file.fill(*definition);
3275          }
3276 
3277          handle_pseudo(ctx, register_file, instr.get());
3278 
3279          /* kill definitions and late-kill operands, and ensure that sub-dword
3280           * operands can actually be read */
3281          for (const Definition& def : instr->definitions) {
3282             if (def.isTemp() && def.isKill())
3283                register_file.clear(def);
3284          }
3285          for (unsigned i = 0; i < instr->operands.size(); i++) {
3286             const Operand& op = instr->operands[i];
3287             if (op.isTemp() && op.isFirstKill() && op.isLateKill())
3288                register_file.clear(op);
3289             if (op.isTemp() && op.physReg().byte() != 0)
3290                add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
3291          }
3292 
3293          emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file);
3294 
3295          /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
3296          bool instr_needs_vop3 =
3297             !instr->isVOP3() &&
3298             ((withoutDPP(instr->format) == Format::VOPC &&
3299               instr->definitions[0].physReg() != vcc) ||
3300              (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) ||
3301              ((instr->opcode == aco_opcode::v_add_co_u32 ||
3302                instr->opcode == aco_opcode::v_addc_co_u32 ||
3303                instr->opcode == aco_opcode::v_sub_co_u32 ||
3304                instr->opcode == aco_opcode::v_subb_co_u32 ||
3305                instr->opcode == aco_opcode::v_subrev_co_u32 ||
3306                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
3307               instr->definitions[1].physReg() != vcc) ||
3308              ((instr->opcode == aco_opcode::v_addc_co_u32 ||
3309                instr->opcode == aco_opcode::v_subb_co_u32 ||
3310                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
3311               instr->operands[2].physReg() != vcc));
3312          if (instr_needs_vop3) {
3313 
3314             /* If the first operand is a literal, we have to move it to an sgpr
3315              * for generations without VOP3+literal support.
3316              * Both literals and sgprs count towards the constant bus limit,
3317              * so this is always valid.
3318              */
3319             if (instr->operands.size() && instr->operands[0].isLiteral() &&
3320                 program->gfx_level < GFX10) {
3321                /* Re-use the register we already allocated for the definition.
3322                 * This works because the instruction cannot have any other SGPR operand.
3323                 */
3324                Temp tmp = program->allocateTmp(instr->operands[0].size() == 2 ? s2 : s1);
3325                const Definition& def =
3326                   instr->isVOPC() ? instr->definitions[0] : instr->definitions.back();
3327                assert(def.regClass() == s2);
3328                ctx.assignments.emplace_back(def.physReg(), tmp.regClass());
3329 
3330                Instruction* copy =
3331                   create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1);
3332                copy->operands[0] = instr->operands[0];
3333                if (copy->operands[0].bytes() < 4)
3334                   copy->operands[0] = Operand::c32(copy->operands[0].constantValue());
3335                copy->definitions[0] = Definition(tmp);
3336                copy->definitions[0].setFixed(def.physReg());
3337 
3338                instr->operands[0] = Operand(tmp);
3339                instr->operands[0].setFixed(def.physReg());
3340                instr->operands[0].setFirstKill(true);
3341 
3342                instructions.emplace_back(copy);
3343             }
3344 
3345             /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
3346             instr->format = asVOP3(instr->format);
3347          }
3348 
3349          instructions.emplace_back(std::move(*instr_it));
3350 
3351       } /* end for Instr */
3352 
3353       if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) {
3354          /* Reset this for block_kind_resume. */
3355          ctx.num_linear_vgprs = 0;
3356 
3357          ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
3358          ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false);
3359          assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds);
3360          assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds);
3361       } else if (should_compact_linear_vgprs(ctx, register_file)) {
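         /* Compact linear VGPRs before the block's final branch: temporarily
          * pop the branch, emit the compaction copies, then re-append it. */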
3362          aco_ptr<Instruction> br = std::move(instructions.back());
3363          instructions.pop_back();
3364 
3365          bool temp_in_scc =
3366             register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);
3367 
3368          std::vector<std::pair<Operand, Definition>> parallelcopy;
3369          compact_linear_vgprs(ctx, register_file, parallelcopy);
3370          update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
3371          emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file);
3372 
3373          instructions.push_back(std::move(br));
3374       }
3375 
3376       block.instructions = std::move(instructions);
3377    } /* end for BB */
3378 
3379    /* num_gpr = rnd_up(max_used_gpr + 1) */
3380    program->config->num_vgprs =
3381       std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
3382    program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
3383 
3384    program->progress = CompilationProgress::after_ra;
3385 }
3386 
3387 } // namespace aco
3388