• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_ir.h"
26 
27 #include "util/bitset.h"
28 #include "util/enum_operators.h"
29 
30 #include <algorithm>
31 #include <array>
32 #include <bitset>
33 #include <map>
34 #include <optional>
35 #include <set>
36 #include <unordered_map>
37 #include <vector>
38 
39 namespace aco {
40 namespace {
41 
struct ra_ctx;

/* Forward declarations for the sub-dword helpers defined later in this file. */

/* Minimum register-byte alignment an operand in the given RegClass must have
 * for this instruction (4 = dword-aligned only). */
unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                                     unsigned idx, RegClass rc);
/* Rewrites the instruction (opcode/SDWA/opsel) so operand idx is read from the
 * given byte offset within its register. */
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                          RegClass rc);
/* Returns (minimum stride in bytes, number of bytes written) for a sub-dword
 * definition of this instruction. */
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc);
/* Rewrites the instruction so its definition is written at the given register byte. */
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg);
51 
/* Register assignment state for one temporary. */
struct assignment {
   PhysReg reg;
   RegClass rc;
   union {
      struct {
         bool assigned : 1; /* true once a physical register has been chosen */
         bool vcc : 1;
         bool m0 : 1;
      };
      /* Overlays the bitfield so all flags zero-initialize in one go. */
      uint8_t _ = 0;
   };
   /* Id of a temp this one prefers to share a register with (0 = none) —
    * presumably set up for phis/vectors elsewhere; not visible in this chunk. */
   uint32_t affinity = 0;
   assignment() = default;
   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }

   /* Adopt the register and class of an already-fixed definition. */
   void set(const Definition& def)
   {
      assigned = true;
      reg = def.physReg();
      rc = def.regClass();
   }
};
73 
/* Pass-wide state for register allocation. */
struct ra_ctx {

   Program* program;
   Block* block = NULL; /* block currently being processed */
   /* Indexed by temp id; grows as parallelcopy temps are allocated. */
   std::vector<assignment> assignments;
   /* Per-block rename maps: original temp id -> current temp. */
   std::vector<std::unordered_map<unsigned, Temp>> renames;
   std::vector<uint32_t> loop_header;
   /* Maps renamed temp ids back to the pre-RA temp (used for debug printing). */
   std::unordered_map<unsigned, Temp> orig_names;
   std::unordered_map<unsigned, Instruction*> vectors;
   std::unordered_map<unsigned, Instruction*> split_vectors;
   /* Placeholder instructions usable where a helper needs some instruction. */
   aco_ptr<Instruction> pseudo_dummy;
   aco_ptr<Instruction> phi_dummy;
   uint16_t max_used_sgpr = 0;
   uint16_t max_used_vgpr = 0;
   /* Addressable register counts derived from the minimum wave count. */
   uint16_t sgpr_limit;
   uint16_t vgpr_limit;
   /* Registers to avoid for write-after-read hazards. */
   std::bitset<512> war_hint;

   ra_test_policy policy;

   ra_ctx(Program* program_, ra_test_policy policy_)
       : program(program_), assignments(program->peekAllocationId()),
         renames(program->blocks.size()), policy(policy_)
   {
      pseudo_dummy.reset(
         create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
      phi_dummy.reset(
         create_instruction<Instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
      sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
      vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
   }
};
106 
/* Iterator type for making PhysRegInterval compatible with range-based for */
struct PhysRegIterator {
   using difference_type = int;
   using value_type = unsigned;
   using reference = const unsigned&;
   using pointer = const unsigned*;
   using iterator_category = std::bidirectional_iterator_tag;

   PhysReg reg;

   PhysReg operator*() const { return reg; }

   /* Steps one full dword register (reg_b counts bytes, hence +/- 4). */
   PhysRegIterator& operator++()
   {
      reg.reg_b += 4;
      return *this;
   }

   PhysRegIterator& operator--()
   {
      reg.reg_b -= 4;
      return *this;
   }

   bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }

   bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }

   bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
};
137 
/* Half-open register interval used in "sliding window"-style for-loops */
struct PhysRegInterval {
   PhysReg lo_;
   unsigned size; /* number of dword registers in the interval */

   /* Inclusive lower bound */
   PhysReg lo() const { return lo_; }

   /* Exclusive upper bound */
   PhysReg hi() const { return PhysReg{lo() + size}; }

   /* Slides the window up by `stride` registers; size is unchanged. */
   PhysRegInterval& operator+=(uint32_t stride)
   {
      lo_ = PhysReg{lo_.reg() + stride};
      return *this;
   }

   bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }

   /* Construct a half-open interval, excluding the end register */
   static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }

   bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }

   /* True when `needle` lies entirely within this interval. */
   bool contains(const PhysRegInterval& needle) const
   {
      return needle.lo() >= lo() && needle.hi() <= hi();
   }

   PhysRegIterator begin() const { return {lo_}; }

   PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
};
171 
172 bool
intersects(const PhysRegInterval & a,const PhysRegInterval & b)173 intersects(const PhysRegInterval& a, const PhysRegInterval& b)
174 {
175    return a.hi() > b.lo() && b.hi() > a.lo();
176 }
177 
178 /* Gets the stride for full (non-subdword) registers */
179 uint32_t
get_stride(RegClass rc)180 get_stride(RegClass rc)
181 {
182    if (rc.type() == RegType::vgpr) {
183       return 1;
184    } else {
185       uint32_t size = rc.size();
186       if (size == 2) {
187          return 2;
188       } else if (size >= 4) {
189          return 4;
190       } else {
191          return 1;
192       }
193    }
194 }
195 
196 PhysRegInterval
get_reg_bounds(Program * program,RegType type)197 get_reg_bounds(Program* program, RegType type)
198 {
199    if (type == RegType::vgpr) {
200       return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};
201    } else {
202       return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};
203    }
204 }
205 
/* Size, stride and allocatable bounds for a definition about to get a register. */
struct DefInfo {
   PhysRegInterval bounds; /* window of registers the definition may use */
   uint8_t size;           /* size in dwords */
   uint8_t stride;         /* alignment: dwords normally, bytes for subdword classes */
   RegClass rc;

   /* `operand`: index of the tied operand this register is chosen for, or -1
    * for a plain definition. */
   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);

      bounds = get_reg_bounds(ctx.program, rc.type());

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
      } else if (rc.is_subdword()) {
         std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
         stride = info.first;
         if (info.second > rc.bytes()) {
            /* The instruction writes more bytes than the class covers: widen
             * the class so the extra written bytes are reserved too. */
            rc = RegClass::get(rc.type(), info.second);
            size = rc.size();
            /* we might still be able to put the definition in the high half,
             * but that's only useful for affinities and this information isn't
             * used for them */
            stride = align(stride, info.second);
            if (!rc.is_subdword())
               stride = DIV_ROUND_UP(stride, 4); /* non-subdword strides are in dwords */
         }
         assert(stride > 0);
      } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
         /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
          *
          * The register use is not calculated correctly, and the hardware assumes a
          * full dword per component. Don't use the last registers of the register file.
          * Otherwise, the instruction will be skipped.
          *
          * https://reviews.llvm.org/D81172
          */
         bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
         assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");

         if (imageGather4D16Bug)
            bounds.size -= rc.bytes() / 4;
      }
   }
};
253 
/* Occupancy model of the physical register file.
 *
 * regs[i] holds, per dword register:
 *  - 0          : free
 *  - 0xFFFFFFFF : blocked (unusable)
 *  - 0xF0000000 : allocated at sub-dword granularity; the per-byte temp ids
 *                 (or 0xFFFFFFFF for blocked bytes) live in subdword_regs[i]
 *  - otherwise  : id of the temp occupying the register
 */
class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

   std::array<uint32_t, 512> regs;
   /* Per-byte occupancy for dwords marked 0xF0000000 in regs. */
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   /* Number of completely free dword registers in the interval. */
   unsigned count_zero(PhysRegInterval reg_interval)
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes)
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         /* Masking the top nibble matches real temp ids and the blocked
          * sentinel, but not the bare sub-dword marker 0xF0000000. */
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            assert(subdword_regs.find(i) != subdword_regs.end());
            /* Only check the bytes of this dword that lie inside the range. */
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   /* Mark the registers covered by rc as unusable. */
   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   /* True if the dword at `start` is blocked, fully or in any byte. */
   bool is_blocked(PhysReg start)
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start)
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         return subdword_regs[start][start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   /* Free the registers covered by rc. */
   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   /* Mark an operand's registers as occupied by its temp id. */
   void fill(Operand op)
   {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   /* Mark a definition's registers as occupied by its temp id. */
   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   /* Temp id occupying the (possibly sub-dword) register, 0 if free. */
   unsigned get_id(PhysReg reg)
   {
      return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         /* When a dword becomes fully free again, drop the sub-dword entry
          * and return the dword to the plain "free" state. */
         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
378 
/* Defined later: returns the ids of all variables allocated inside the interval. */
std::vector<unsigned> find_vars(ra_ctx& ctx, RegisterFile& reg_file,
                                const PhysRegInterval reg_interval);
381 
382 /* helper function for debugging */
383 UNUSED void
print_reg(const RegisterFile & reg_file,PhysReg reg,bool has_adjacent_variable)384 print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
385 {
386    if (reg_file[reg] == 0xFFFFFFFF) {
387       printf((const char*)u8"☐");
388    } else if (reg_file[reg]) {
389       const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
390       if (show_subdword_alloc) {
391          auto block_chars = {
392             // clang-format off
393             u8"?", u8"▘", u8"▝", u8"▀",
394             u8"▖", u8"▌", u8"▞", u8"▛",
395             u8"▗", u8"▚", u8"▐", u8"▜",
396             u8"▄", u8"▙", u8"▟", u8"▉"
397             // clang-format on
398          };
399          unsigned index = 0;
400          for (int i = 0; i < 4; ++i) {
401             if (reg_file.subdword_regs.at(reg)[i]) {
402                index |= 1 << i;
403             }
404          }
405          printf("%s", (const char*)(block_chars.begin()[index]));
406       } else {
407          /* Indicate filled register slot */
408          if (!has_adjacent_variable) {
409             printf((const char*)u8"█");
410          } else {
411             /* Use a slightly shorter box to leave a small gap between adjacent variables */
412             printf((const char*)u8"▉");
413          }
414       }
415    } else {
416       printf((const char*)u8"·");
417    }
418 }
419 
/* helper function for debugging: dumps the whole register file as a grid plus
 * a per-variable assignment listing */
UNUSED void
print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);
   char reg_char = vgprs ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf("       ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage, one glyph per register, wrapped at max_regs_per_line */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         /* Continuation lines show the offset from the start of the file. */
         printf("  %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         /* A boundary between two different occupied variables gets a shorter
          * glyph so adjacent variables are visually separated. */
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */
   for (unsigned id : find_vars(ctx, reg_file, regs)) {
      const assignment& var = ctx.assignments[id];
      PhysReg reg = var.reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      /* Show the pre-RA name when the temp was renamed. */
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() - regs.lo());
      }
      printf("]");
      /* Sub-dword variables additionally get a bit range. */
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}
494 
495 unsigned
get_subdword_operand_stride(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr,unsigned idx,RegClass rc)496 get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
497                             unsigned idx, RegClass rc)
498 {
499    if (instr->isPseudo()) {
500       /* v_readfirstlane_b32 cannot use SDWA */
501       if (instr->opcode == aco_opcode::p_as_uniform)
502          return 4;
503       else if (gfx_level >= GFX8)
504          return rc.bytes() % 2 == 0 ? 2 : 1;
505       else
506          return 4;
507    }
508 
509    assert(rc.bytes() <= 2);
510    if (instr->isVALU()) {
511       if (can_use_SDWA(gfx_level, instr, false))
512          return rc.bytes();
513       if (can_use_opsel(gfx_level, instr->opcode, idx))
514          return 2;
515       if (instr->isVOP3P())
516          return 2;
517    }
518 
519    switch (instr->opcode) {
520    case aco_opcode::v_cvt_f32_ubyte0: return 1;
521    case aco_opcode::ds_write_b8:
522    case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
523    case aco_opcode::buffer_store_byte:
524    case aco_opcode::buffer_store_short:
525    case aco_opcode::buffer_store_format_d16_x:
526    case aco_opcode::flat_store_byte:
527    case aco_opcode::flat_store_short:
528    case aco_opcode::scratch_store_byte:
529    case aco_opcode::scratch_store_short:
530    case aco_opcode::global_store_byte:
531    case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
532    default: return 4;
533    }
534 }
535 
/* Rewrites the instruction so operand idx is read from the given byte offset
 * within its register (SDWA, opsel, or a byte-specific opcode variant). */
void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                     RegClass rc)
{
   amd_gfx_level gfx_level = ctx.program->gfx_level;
   /* Pseudo instructions handle sub-dword operands directly; byte 0 needs no
    * rewrite at all. */
   if (instr->isPseudo() || byte == 0)
      return;

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      /* v_cvt_f32_ubyte0 has per-byte opcode variants. */
      if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
         switch (byte) {
         case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
         case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
         case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
         case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
         }
         return;
      }

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      /* use opsel */
      if (instr->isVOP3P()) {
         /* VOP3P: select the high half for both lo and hi lanes. */
         assert(byte == 2 && !instr->valu().opsel_lo[idx]);
         instr->valu().opsel_lo[idx] = true;
         instr->valu().opsel_hi[idx] = true;
         return;
      }

      assert(can_use_opsel(gfx_level, instr->opcode, idx));
      instr->valu().opsel[idx] = true;
      return;
   }

   /* Memory stores: switch to the _d16_hi opcode that reads the high half. */
   assert(byte == 2);
   if (instr->opcode == aco_opcode::ds_write_b8)
      instr->opcode = aco_opcode::ds_write_b8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_write_b16)
      instr->opcode = aco_opcode::ds_write_b16_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_byte)
      instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_short)
      instr->opcode = aco_opcode::buffer_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
      instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_store_byte)
      instr->opcode = aco_opcode::flat_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_store_short)
      instr->opcode = aco_opcode::flat_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_byte)
      instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_short)
      instr->opcode = aco_opcode::scratch_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_byte)
      instr->opcode = aco_opcode::global_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_short)
      instr->opcode = aco_opcode::global_store_short_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
   return;
}
602 
/* minimum_stride, bytes_written */
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)
{
   amd_gfx_level gfx_level = program->gfx_level;

   if (instr->isPseudo()) {
      if (instr->opcode == aco_opcode::p_interp_gfx11)
         return std::make_pair(4u, 4u);
      else if (gfx_level >= GFX8)
         return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
      else
         /* Pre-GFX8 has no sub-dword moves: a full dword is written. */
         return std::make_pair(4, rc.size() * 4u);
   }

   if (instr->isVALU() || instr->isVINTRP()) {
      assert(rc.bytes() <= 2);

      if (can_use_SDWA(gfx_level, instr, false))
         return std::make_pair(rc.bytes(), rc.bytes());

      /* True 16-bit instructions leave the other half untouched. */
      unsigned bytes_written = 4u;
      if (instr_is_16bit(gfx_level, instr->opcode))
         bytes_written = 2u;

      /* opsel (or mixlo->mixhi) allows targeting the high half. */
      unsigned stride = 4u;
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
          can_use_opsel(gfx_level, instr->opcode, -1))
         stride = 2u;

      return std::make_pair(stride, bytes_written);
   }

   switch (instr->opcode) {
   /* D16 loads with _hi version */
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_i8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_sbyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_sbyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_sbyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16:
   case aco_opcode::buffer_load_format_d16_x: {
      assert(gfx_level >= GFX9);
      /* With SRAM ECC, D16 loads write the whole dword. */
      if (!program->dev.sram_ecc_enabled)
         return std::make_pair(2u, 2u);
      else
         return std::make_pair(2u, 4u);
   }
   /* 3-component D16 loads */
   case aco_opcode::buffer_load_format_d16_xyz:
   case aco_opcode::tbuffer_load_format_d16_xyz: {
      assert(gfx_level >= GFX9);
      if (!program->dev.sram_ecc_enabled)
         return std::make_pair(4u, 6u);
      break;
   }

   default: break;
   }

   if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
      assert(gfx_level >= GFX9);
      return std::make_pair(4u, rc.bytes());
   }

   /* Fallback: dword-aligned, whole dwords written. */
   return std::make_pair(4, rc.size() * 4u);
}
679 
/* Rewrites the instruction so its definition is written at the byte offset of
 * `reg` (SDWA, opsel, or a _d16_hi opcode variant). */
void
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)
{
   /* Pseudo instructions handle sub-dword definitions directly. */
   if (instr->isPseudo())
      return;

   if (instr->isVALU()) {
      amd_gfx_level gfx_level = program->gfx_level;
      assert(instr->definitions[0].bytes() <= 2);

      /* A true 16-bit instruction writing the low half needs no rewrite. */
      if (reg.byte() == 0 && instr_is_16bit(gfx_level, instr->opcode))
         return;

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      /* mixlo writes the low half; mixhi is its high-half counterpart. */
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
         instr->opcode = aco_opcode::v_fma_mixhi_f16;
         return;
      }

      /* use opsel */
      assert(reg.byte() == 2);
      assert(can_use_opsel(gfx_level, instr->opcode, -1));
      instr->valu().opsel[3] = true; /* dst in high half */
      return;
   }

   /* D16 loads: switch to the _d16_hi variant to write the high half. */
   if (reg.byte() == 0)
      return;
   else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
      instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
      instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_short_d16)
      instr->opcode = aco_opcode::buffer_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
      instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
      instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
      instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_short_d16)
      instr->opcode = aco_opcode::flat_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
      instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
      instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_short_d16)
      instr->opcode = aco_opcode::scratch_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
      instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
      instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_short_d16)
      instr->opcode = aco_opcode::global_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u8_d16)
      instr->opcode = aco_opcode::ds_read_u8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_i8_d16)
      instr->opcode = aco_opcode::ds_read_i8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u16_d16)
      instr->opcode = aco_opcode::ds_read_u16_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
}
748 
749 void
adjust_max_used_regs(ra_ctx & ctx,RegClass rc,unsigned reg)750 adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
751 {
752    uint16_t max_addressible_sgpr = ctx.sgpr_limit;
753    unsigned size = rc.size();
754    if (rc.type() == RegType::vgpr) {
755       assert(reg >= 256);
756       uint16_t hi = reg - 256 + size - 1;
757       assert(hi <= 255);
758       ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
759    } else if (reg + rc.size() <= max_addressible_sgpr) {
760       uint16_t hi = reg + size - 1;
761       ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
762    }
763 }
764 
/* Bitfield flags controlling update_renames() behavior.
 * NOTE(review): semantics inferred from the flag names and the single visible
 * use of fill_killed_ops below — confirm against update_renames() callers. */
enum UpdateRenames {
   rename_not_killed_ops = 0x1,
   /* When set, registers of killed operands are re-filled after a move. */
   fill_killed_ops = 0x2,
   rename_precolored_ops = 0x4,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);
771 
/* Post-processes the parallelcopy list after copy destinations were chosen:
 * clears copied-away operands from reg_file, allocates fresh temporary ids
 * for copy destinations that don't have one yet, and renames matching
 * operands/definitions of instr to the copied temporaries. Entries whose
 * destination already has a temp id are assumed to be handled elsewhere and
 * are skipped. Behavior is tuned by the UpdateRenames flags. */
772 void
update_renames(ra_ctx & ctx,RegisterFile & reg_file,std::vector<std::pair<Operand,Definition>> & parallelcopies,aco_ptr<Instruction> & instr,UpdateRenames flags)773 update_renames(ra_ctx& ctx, RegisterFile& reg_file,
774                std::vector<std::pair<Operand, Definition>>& parallelcopies,
775                aco_ptr<Instruction>& instr, UpdateRenames flags)
776 {
777    /* clear operands */
778    for (std::pair<Operand, Definition>& copy : parallelcopies) {
779       /* the definitions with id are not from this function and already handled */
780       if (copy.second.isTemp())
781          continue;
782       reg_file.clear(copy.first);
783    }
784 
785    /* allocate id's and rename operands: this is done transparently here */
786    auto it = parallelcopies.begin();
787    while (it != parallelcopies.end()) {
788       if (it->second.isTemp()) {
789          ++it;
790          continue;
791       }
792 
793       /* check if we moved a definition: change the register and remove copy */
794       bool is_def = false;
795       for (Definition& def : instr->definitions) {
796          if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
797             // FIXME: ensure that the definition can use this reg
798             def.setFixed(it->second.physReg());
799             reg_file.fill(def);
800             ctx.assignments[def.tempId()].reg = def.physReg();
               /* the copy is redundant now: the definition itself moved */
801             it = parallelcopies.erase(it);
802             is_def = true;
803             break;
804          }
805       }
806       if (is_def)
807          continue;
808 
809       /* check if we moved another parallelcopy definition */
810       for (std::pair<Operand, Definition>& other : parallelcopies) {
811          if (!other.second.isTemp())
812             continue;
813          if (it->first.getTemp() == other.second.getTemp()) {
814             other.second.setFixed(it->second.physReg());
815             ctx.assignments[other.second.tempId()].reg = other.second.physReg();
816             it = parallelcopies.erase(it);
817             is_def = true;
818             /* check if we moved an operand, again */
819             bool fill = true;
820             for (Operand& op : instr->operands) {
821                if (op.isTemp() && op.tempId() == other.second.tempId()) {
822                   // FIXME: ensure that the operand can use this reg
823                   op.setFixed(other.second.physReg());
824                   fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
825                }
826             }
827             if (fill)
828                reg_file.fill(other.second);
829             break;
830          }
831       }
832       if (is_def)
833          continue;
834 
       /* neither a definition nor another copy: allocate a fresh temp id for
        * this copy's destination */
835       std::pair<Operand, Definition>& copy = *it;
836       copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
837       ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
838       assert(ctx.assignments.size() == ctx.program->peekAllocationId());
839 
840       /* check if we moved an operand */
841       bool first = true;
842       bool fill = true;
843       for (unsigned i = 0; i < instr->operands.size(); i++) {
844          Operand& op = instr->operands[i];
845          if (!op.isTemp())
846             continue;
847          if (op.tempId() == copy.first.tempId()) {
848             /* only rename precolored operands if the copy-location matches */
849             if ((flags & rename_precolored_ops) && op.isFixed() &&
850                 op.physReg() != copy.second.physReg())
851                continue;
852 
853             bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();
            /* only keep the old name if the copy source range does not overlap
             * any copy destination range */
854             for (std::pair<Operand, Definition>& pc : parallelcopies) {
855                PhysReg def_reg = pc.second.physReg();
856                omit_renaming &= def_reg > copy.first.physReg()
857                                    ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
858                                    : (def_reg + pc.second.size() <= copy.first.physReg().reg());
859             }
            /* operand keeps the old temporary; it now dies here, so mark the
             * kill flags accordingly */
860             if (omit_renaming) {
861                if (first)
862                   op.setFirstKill(true);
863                else
864                   op.setKill(true);
865                first = false;
866                continue;
867             }
868             op.setTemp(copy.second.getTemp());
869             op.setFixed(copy.second.physReg());
870 
871             fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
872          }
873       }
874 
875       if (fill)
876          reg_file.fill(copy.second);
877 
878       ++it;
879    }
880 }
881 
/* Searches reg_file for a free area matching info (size/stride/bounds)
 * without moving any already-assigned variables. First recursively prefers
 * larger power-of-two strides (better alignment for later allocations), then
 * uses best-fit for stride 1 or first-fit otherwise, and finally tries
 * sub-dword packing into partially used registers. Returns an empty optional
 * when no free area exists. */
882 std::optional<PhysReg>
get_reg_simple(ra_ctx & ctx,RegisterFile & reg_file,DefInfo info)883 get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
884 {
885    const PhysRegInterval& bounds = info.bounds;
886    uint32_t size = info.size;
    /* sub-dword strides are in bytes; convert to a dword granule here */
887    uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
888    RegClass rc = info.rc;
889 
    /* try more aligned positions first: recurse with the largest stride that
     * divides the size, halving until the required stride is reached */
890    DefInfo new_info = info;
891    new_info.rc = RegClass(rc.type(), size);
892    for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) {
893       if (size % new_stride)
894          continue;
895       new_info.stride = new_stride;
896       std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info);
897       if (res)
898          return res;
899    }
900 
    /* a register is usable if it is unoccupied and not WAR-hinted */
901    auto is_free = [&](PhysReg reg_index)
902    { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };
903 
904    if (stride == 1) {
905       /* best fit algorithm: find the smallest gap to fit in the variable */
906       PhysRegInterval best_gap{PhysReg{0}, UINT_MAX};
907       const unsigned max_gpr =
908          (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr;
909 
910       PhysRegIterator reg_it = bounds.begin();
       /* only scan up to one past the highest register used so far — everything
        * beyond is known to be free */
911       const PhysRegIterator end_it =
912          std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it));
913       while (reg_it != bounds.end()) {
914          /* Find the next chunk of available register slots */
915          reg_it = std::find_if(reg_it, end_it, is_free);
916          auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free);
917          if (reg_it == bounds.end()) {
918             break;
919          }
920 
921          if (next_nonfree_it == end_it) {
922             /* All registers past max_used_gpr are free */
923             next_nonfree_it = bounds.end();
924          }
925 
926          PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it);
927 
928          /* early return on exact matches */
929          if (size == gap.size) {
930             adjust_max_used_regs(ctx, rc, gap.lo());
931             return gap.lo();
932          }
933 
934          /* check if it fits and the gap size is smaller */
935          if (size < gap.size && gap.size < best_gap.size) {
936             best_gap = gap;
937          }
938 
939          /* Move past the processed chunk */
940          reg_it = next_nonfree_it;
941       }
942 
943       if (best_gap.size == UINT_MAX)
944          return {};
945 
946       /* find best position within gap by leaving a good stride for other variables*/
947       unsigned buffer = best_gap.size - size;
948       if (buffer > 1) {
          /* place at the gap's end when doing so yields better 8/4/2 alignment */
949          if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) ||
950              ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) ||
951              ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0))
952             best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer};
953       }
954 
955       adjust_max_used_regs(ctx, rc, best_gap.lo());
956       return best_gap.lo();
957    }
958 
    /* strided first-fit: slide a window of the required size through bounds */
959    for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
960         reg_win += stride) {
961       if (reg_file[reg_win.lo()] != 0) {
962          continue;
963       }
964 
965       bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);
966       if (is_valid) {
967          adjust_max_used_regs(ctx, rc, reg_win.lo());
968          return reg_win.lo();
969       }
970    }
971 
972    /* do this late because using the upper bytes of a register can require
973     * larger instruction encodings or copies
974     * TODO: don't do this in situations where it doesn't benefit */
975    if (rc.is_subdword()) {
976       for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {
977          assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
978          if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
979             continue;
980 
981          for (unsigned i = 0; i < 4; i += info.stride) {
982             /* check if there's a block of free bytes large enough to hold the register */
983             bool reg_found =
984                std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())],
985                            [](unsigned v) { return v == 0; });
986 
987             /* check if also the neighboring reg is free if needed */
988             if (reg_found && i + rc.bytes() > 4)
989                reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);
990 
991             if (reg_found) {
992                PhysReg res{entry.first};
993                res.reg_b += i;
994                adjust_max_used_regs(ctx, rc, entry.first);
995                return res;
996             }
997          }
998       }
999    }
1000 
1001    return {};
1002 }
1003 
1004 /* collect variables from a register area */
1005 std::vector<unsigned>
find_vars(ra_ctx & ctx,RegisterFile & reg_file,const PhysRegInterval reg_interval)1006 find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
1007 {
1008    std::vector<unsigned> vars;
1009    for (PhysReg j : reg_interval) {
1010       if (reg_file.is_blocked(j))
1011          continue;
1012       if (reg_file[j] == 0xF0000000) {
1013          for (unsigned k = 0; k < 4; k++) {
1014             unsigned id = reg_file.subdword_regs[j][k];
1015             if (id && (vars.empty() || id != vars.back()))
1016                vars.emplace_back(id);
1017          }
1018       } else {
1019          unsigned id = reg_file[j];
1020          if (id && (vars.empty() || id != vars.back()))
1021             vars.emplace_back(id);
1022       }
1023    }
1024    return vars;
1025 }
1026 
1027 /* collect variables from a register area and clear reg_file
1028  * variables are sorted in decreasing size and
1029  * increasing assigned register
1030  */
1031 std::vector<unsigned>
collect_vars(ra_ctx & ctx,RegisterFile & reg_file,const PhysRegInterval reg_interval)1032 collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
1033 {
1034    std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
1035    std::sort(ids.begin(), ids.end(),
1036              [&](unsigned a, unsigned b)
1037              {
1038                 assignment& var_a = ctx.assignments[a];
1039                 assignment& var_b = ctx.assignments[b];
1040                 return var_a.rc.bytes() > var_b.rc.bytes() ||
1041                        (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
1042              });
1043 
1044    for (unsigned id : ids) {
1045       assignment& var = ctx.assignments[id];
1046       reg_file.clear(var.reg, var.rc);
1047    }
1048    return ids;
1049 }
1050 
/* For a p_create_vector destination area def_reg: tries to place the variable
 * id either at its position within the vector (when it is a dead operand of
 * the instruction) or, on GFX9+ where a VGPR swap instruction exists, to swap
 * it with a killed operand that currently sits at id's in-vector position.
 * On a swap, the required parallelcopy is appended. Returns the chosen
 * register or an empty optional. */
1051 std::optional<PhysReg>
get_reg_for_create_vector_copy(ra_ctx & ctx,RegisterFile & reg_file,std::vector<std::pair<Operand,Definition>> & parallelcopies,aco_ptr<Instruction> & instr,const PhysRegInterval def_reg,DefInfo info,unsigned id)1052 get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
1053                                std::vector<std::pair<Operand, Definition>>& parallelcopies,
1054                                aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
1055                                DefInfo info, unsigned id)
1056 {
1057    PhysReg reg = def_reg.lo();
1058    /* dead operand: return position in vector */
1059    for (unsigned i = 0; i < instr->operands.size(); i++) {
1060       if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
1061           instr->operands[i].isKillBeforeDef()) {
1062          assert(!reg_file.test(reg, instr->operands[i].bytes()));
          /* dword variables must not land at a sub-dword offset */
1063          if (info.rc.is_subdword() || reg.byte() == 0)
1064             return reg;
1065          else
1066             return {};
1067       }
1068       reg.reg_b += instr->operands[i].bytes();
1069    }
1070 
1071    /* GFX9+ has a VGPR swap instruction. */
1072    if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr)
1073       return {};
1074 
1075    /* check if the previous position was in vector */
1076    assignment& var = ctx.assignments[id];
1077    if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
1078       reg = def_reg.lo();
1079       /* try to use the previous register of the operand */
1080       for (unsigned i = 0; i < instr->operands.size(); i++) {
1081          if (reg != var.reg) {
1082             reg.reg_b += instr->operands[i].bytes();
1083             continue;
1084          }
1085 
1086          /* check if we can swap positions */
1087          if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
1088              instr->operands[i].regClass() == info.rc) {
1089             assignment& op = ctx.assignments[instr->operands[i].tempId()];
1090             /* if everything matches, create parallelcopy for the killed operand */
1091             if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc &&
1092                 reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
1093                Definition pc_def = Definition(reg, info.rc);
1094                parallelcopies.emplace_back(instr->operands[i], pc_def);
                /* id takes over the killed operand's old register */
1095                return op.reg;
1096             }
1097          }
1098          return {};
1099       }
1100    }
1101    return {};
1102 }
1103 
/* Finds new positions for each variable in vars (so that the area def_reg can
 * be used for a definition) and records the required parallelcopies. Tries
 * cheap options first (reusing dead-operand space, a simple free area), then
 * falls back to a sliding-window search that may recursively displace further
 * variables. Found areas are blocked in reg_file as it goes. Returns false if
 * any variable cannot be placed. */
1104 bool
get_regs_for_copies(ra_ctx & ctx,RegisterFile & reg_file,std::vector<std::pair<Operand,Definition>> & parallelcopies,const std::vector<unsigned> & vars,aco_ptr<Instruction> & instr,const PhysRegInterval def_reg)1105 get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
1106                     std::vector<std::pair<Operand, Definition>>& parallelcopies,
1107                     const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
1108                     const PhysRegInterval def_reg)
1109 {
1110    /* Variables are sorted from large to small and with increasing assigned register */
1111    for (unsigned id : vars) {
1112       assignment& var = ctx.assignments[id];
1113       PhysRegInterval bounds = get_reg_bounds(ctx.program, var.rc.type());
1114       DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
1115       uint32_t size = info.size;
1116 
1117       /* check if this is a dead operand, then we can re-use the space from the definition
1118        * also use the correct stride for sub-dword operands */
1119       bool is_dead_operand = false;
1120       std::optional<PhysReg> res;
1121       if (instr->opcode == aco_opcode::p_create_vector) {
1122          res =
1123             get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
1124       } else {
1125          for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
1126             if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
1127                info = DefInfo(ctx, instr, var.rc, i);
1128                if (instr->operands[i].isKillBeforeDef()) {
1129                   info.bounds = def_reg;
1130                   res = get_reg_simple(ctx, reg_file, info);
1131                   is_dead_operand = true;
1132                }
1133                break;
1134             }
1135          }
1136       }
1137       if (!res && !def_reg.size) {
1138          /* If this is before definitions are handled, def_reg may be an empty interval. */
1139          info.bounds = bounds;
1140          res = get_reg_simple(ctx, reg_file, info);
1141       } else if (!res) {
1142          /* Try to find space within the bounds but outside of the definition */
1143          info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
1144          res = get_reg_simple(ctx, reg_file, info);
1145          if (!res && def_reg.hi() <= bounds.hi()) {
            /* round the lower bound up to the required stride */
1146             unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
1147             info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
1148             res = get_reg_simple(ctx, reg_file, info);
1149          }
1150       }
1151 
1152       if (res) {
1153          /* mark the area as blocked */
1154          reg_file.block(*res, var.rc);
1155 
1156          /* create parallelcopy pair (without definition id) */
1157          Temp tmp = Temp(id, var.rc);
1158          Operand pc_op = Operand(tmp);
1159          pc_op.setFixed(var.reg);
1160          Definition pc_def = Definition(*res, pc_op.regClass());
1161          parallelcopies.emplace_back(pc_op, pc_def);
1162          continue;
1163       }
1164 
       /* no free area: search for the window requiring the fewest moves */
1165       PhysReg best_pos = bounds.lo();
1166       unsigned num_moves = 0xFF;
1167       unsigned num_vars = 0;
1168 
1169       /* we use a sliding window to find potential positions */
1170       unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
1171       for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
1172            reg_win += stride) {
1173          if (!is_dead_operand && intersects(reg_win, def_reg))
1174             continue;
1175 
1176          /* second, check that we have at most k=num_moves elements in the window
1177           * and no element is larger than the currently processed one */
1178          unsigned k = 0;
1179          unsigned n = 0;
1180          unsigned last_var = 0;
1181          bool found = true;
1182          for (PhysReg j : reg_win) {
1183             if (reg_file[j] == 0 || reg_file[j] == last_var)
1184                continue;
1185 
1186             if (reg_file.is_blocked(j) || k > num_moves) {
1187                found = false;
1188                break;
1189             }
1190             if (reg_file[j] == 0xF0000000) {
1191                k += 1;
1192                n++;
1193                continue;
1194             }
1195             /* we cannot split live ranges of linear vgprs inside control flow */
1196             if (!(ctx.block->kind & block_kind_top_level) &&
1197                 ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
1198                found = false;
1199                break;
1200             }
1201             bool is_kill = false;
1202             for (const Operand& op : instr->operands) {
1203                if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
1204                   is_kill = true;
1205                   break;
1206                }
1207             }
             /* only smaller variables may be displaced, to guarantee progress */
1208             if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
1209                found = false;
1210                break;
1211             }
1212 
1213             k += ctx.assignments[reg_file[j]].rc.size();
1214             last_var = reg_file[j];
1215             n++;
1216             if (k > num_moves || (k == num_moves && n <= num_vars)) {
1217                found = false;
1218                break;
1219             }
1220          }
1221 
1222          if (found) {
1223             best_pos = reg_win.lo();
1224             num_moves = k;
1225             num_vars = n;
1226          }
1227       }
1228 
1229       /* FIXME: we messed up and couldn't find space for the variables to be copied */
1230       if (num_moves == 0xFF)
1231          return false;
1232 
1233       PhysRegInterval reg_win{best_pos, size};
1234 
1235       /* collect variables and block reg file */
1236       std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);
1237 
1238       /* mark the area as blocked */
1239       reg_file.block(reg_win.lo(), var.rc);
1240       adjust_max_used_regs(ctx, var.rc, reg_win.lo());
1241 
       /* recursively relocate the variables displaced from the chosen window */
1242       if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg))
1243          return false;
1244 
1245       /* create parallelcopy pair (without definition id) */
1246       Temp tmp = Temp(id, var.rc);
1247       Operand pc_op = Operand(tmp);
1248       pc_op.setFixed(var.reg);
1249       Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
1250       parallelcopies.emplace_back(pc_op, pc_def);
1251    }
1252 
1253    return true;
1254 }
1255 
/* Chooses a position for a definition described by info when no free area is
 * available: slides a window over the bounds, estimates for each window how
 * many registers (k) and variables (n) would have to move, picks the cheapest
 * valid window, then relocates its occupants via get_regs_for_copies(). The
 * resulting parallelcopies are appended. Returns an empty optional on
 * failure. */
1256 std::optional<PhysReg>
get_reg_impl(ra_ctx & ctx,RegisterFile & reg_file,std::vector<std::pair<Operand,Definition>> & parallelcopies,const DefInfo & info,aco_ptr<Instruction> & instr)1257 get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
1258              std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
1259              aco_ptr<Instruction>& instr)
1260 {
1261    const PhysRegInterval& bounds = info.bounds;
1262    uint32_t size = info.size;
1263    uint32_t stride = info.stride;
1264    RegClass rc = info.rc;
1265 
1266    /* check how many free regs we have */
1267    unsigned regs_free = reg_file.count_zero(bounds);
1268 
1269    /* mark and count killed operands */
1270    unsigned killed_ops = 0;
1271    std::bitset<256> is_killed_operand; /* per-register */
1272    for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
1273       Operand& op = instr->operands[j];
1274       if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
1275           !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
1276          assert(op.isFixed());
1277 
1278          for (unsigned i = 0; i < op.size(); ++i) {
1279             is_killed_operand[(op.physReg() & 0xff) + i] = true;
1280          }
1281 
1282          killed_ops += op.getTemp().size();
1283       }
1284    }
1285 
1286    assert(regs_free >= size);
1287    /* we might have to move dead operands to dst in order to make space */
1288    unsigned op_moves = 0;
1289 
1290    if (size > (regs_free - killed_ops))
1291       op_moves = size - (regs_free - killed_ops);
1292 
1293    /* find the best position to place the definition */
1294    PhysRegInterval best_win = {bounds.lo(), size};
1295    unsigned num_moves = 0xFF;
1296    unsigned num_vars = 0;
1297 
1298    /* we use a sliding window to check potential positions */
1299    for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
1300         reg_win += stride) {
1301       /* first check if the register window starts in the middle of an
1302        * allocated variable: this is what we have to fix to allow for
1303        * num_moves > size */
1304       if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
1305           reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
1306          continue;
1307       if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
1308           reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
1309          continue;
1310 
1311       /* second, check that we have at most k=num_moves elements in the window
1312        * and no element is larger than the currently processed one */
1313       unsigned k = op_moves;
1314       unsigned n = 0;
1315       unsigned remaining_op_moves = op_moves;
1316       unsigned last_var = 0;
1317       bool found = true;
       /* prefer naturally aligned windows for v4 definitions */
1318       bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
1319       for (const PhysReg j : reg_win) {
1320          /* dead operands effectively reduce the number of estimated moves */
1321          if (is_killed_operand[j & 0xFF]) {
1322             if (remaining_op_moves) {
1323                k--;
1324                remaining_op_moves--;
1325             }
1326             continue;
1327          }
1328 
1329          if (reg_file[j] == 0 || reg_file[j] == last_var)
1330             continue;
1331 
1332          if (reg_file[j] == 0xF0000000) {
1333             k += 1;
1334             n++;
1335             continue;
1336          }
1337 
          /* never displace a variable at least as large as the definition */
1338          if (ctx.assignments[reg_file[j]].rc.size() >= size) {
1339             found = false;
1340             break;
1341          }
1342 
1343          /* we cannot split live ranges of linear vgprs inside control flow */
1344          // TODO: ensure that live range splits inside control flow are never necessary
1345          if (!(ctx.block->kind & block_kind_top_level) &&
1346              ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
1347             found = false;
1348             break;
1349          }
1350 
1351          k += ctx.assignments[reg_file[j]].rc.size();
1352          n++;
1353          last_var = reg_file[j];
1354       }
1355 
1356       if (!found || k > num_moves)
1357          continue;
1358       if (k == num_moves && n < num_vars)
1359          continue;
1360       if (!aligned && k == num_moves && n == num_vars)
1361          continue;
1362 
1363       if (found) {
1364          best_win = reg_win;
1365          num_moves = k;
1366          num_vars = n;
1367       }
1368    }
1369 
1370    if (num_moves == 0xFF)
1371       return {};
1372 
1373    /* now, we figured the placement for our definition */
1374    RegisterFile tmp_file(reg_file);
1375 
1376    /* p_create_vector: also re-place killed operands in the definition space */
1377    if (instr->opcode == aco_opcode::p_create_vector) {
1378       for (Operand& op : instr->operands) {
1379          if (op.isTemp() && op.isFirstKillBeforeDef())
1380             tmp_file.fill(op);
1381       }
1382    }
1383 
1384    std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);
1385 
1386    /* re-enable killed operands */
1387    if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
1388       for (Operand& op : instr->operands) {
1389          if (op.isTemp() && op.isFirstKillBeforeDef())
1390             tmp_file.fill(op);
1391       }
1392    }
1393 
    /* relocate the displaced variables on a copy of the register file; only
     * commit the copies if everything could be placed */
1394    std::vector<std::pair<Operand, Definition>> pc;
1395    if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
1396       return {};
1397 
1398    parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1399 
1400    adjust_max_used_regs(ctx, rc, best_win.lo());
1401    return best_win.lo();
1402 }
1403 
/* Checks whether the fixed register reg is a legal and currently free
 * location for a definition of class rc: in-range, correctly aligned
 * (sub-dword byte alignment or sgpr stride), within the register bounds
 * (with VCC and m0 allowed as special cases), and unoccupied in reg_file.
 * On success, updates the max-used-register bookkeeping and returns true. */
1404 bool
get_reg_specified(ra_ctx & ctx,RegisterFile & reg_file,RegClass rc,aco_ptr<Instruction> & instr,PhysReg reg)1405 get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
1406                   PhysReg reg)
1407 {
1408    /* catch out-of-range registers */
1409    if (reg >= PhysReg{512})
1410       return false;
1411 
    /* sdw_def_info = (byte alignment requirement, test granularity in bytes)
     * for sub-dword definitions of this instruction */
1412    std::pair<unsigned, unsigned> sdw_def_info;
1413    if (rc.is_subdword())
1414       sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);
1415 
1416    if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
1417       return false;
1418    if (!rc.is_subdword() && reg.byte())
1419       return false;
1420 
1421    if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0)
1422       return false;
1423 
1424    PhysRegInterval reg_win = {reg, rc.size()};
1425    PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
1426    PhysRegInterval vcc_win = {vcc, 2};
1427    /* VCC is outside the bounds */
1428    bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
1429    bool is_m0 = rc == s1 && reg == m0;
1430    if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
1431       return false;
1432 
1433    if (rc.is_subdword()) {
       /* test the whole aligned chunk the sub-dword value would occupy */
1434       PhysReg test_reg;
1435       test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
1436       if (reg_file.test(test_reg, sdw_def_info.second))
1437          return false;
1438    } else {
1439       if (reg_file.test(reg, rc.bytes()))
1440          return false;
1441    }
1442 
1443    adjust_max_used_regs(ctx, rc, reg_win.lo());
1444    return true;
1445 }
1446 
1447 bool
increase_register_file(ra_ctx & ctx,RegType type)1448 increase_register_file(ra_ctx& ctx, RegType type)
1449 {
1450    if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
1451       update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
1452                                                           ctx.program->max_reg_demand.sgpr));
1453    } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
1454       update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
1455                                                           ctx.program->max_reg_demand.sgpr + 1));
1456    } else {
1457       return false;
1458    }
1459    return true;
1460 }
1461 
1462 struct IDAndRegClass {
IDAndRegClassaco::__anonc76b5ea00111::IDAndRegClass1463    IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}
1464 
1465    unsigned id;
1466    RegClass rc;
1467 };
1468 
1469 struct IDAndInfo {
IDAndInfoaco::__anonc76b5ea00111::IDAndInfo1470    IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}
1471 
1472    unsigned id;
1473    DefInfo info;
1474 };
1475 
1476 /* Reallocates vars by sorting them and placing each variable after the previous
1477  * one. If one of the variables has 0xffffffff as an ID, the register assigned
1478  * for that variable will be returned.
1479  *
 * vars: variables (id + register class) to pack starting at `start`.
 * parallelcopies: receives one copy per variable that actually moves.
1480  */
PhysReg
compact_relocate_vars(ra_ctx & ctx,const std::vector<IDAndRegClass> & vars,std::vector<std::pair<Operand,Definition>> & parallelcopies,PhysReg start)1481 compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
1482                       std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
1483 {
1484    /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
1485     * temporary sizes to dwords.
1486     */
1487    std::vector<IDAndInfo> sorted;
1488    for (IDAndRegClass var : vars) {
1489       DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
1490       sorted.emplace_back(var.id, info);
1491    }
1492 
    /* sort by decreasing stride (in bytes), ties broken by current register;
     * NOTE(review): if two entries both had id 0xffffffff this comparator
     * would violate strict weak ordering -- presumably at most one sentinel
     * is ever passed in; verify at the callers */
1493    std::sort(
1494       sorted.begin(), sorted.end(),
1495       [&ctx](const IDAndInfo& a, const IDAndInfo& b)
1496       {
1497          unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
1498          unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
1499          if (a_stride > b_stride)
1500             return true;
1501          if (a_stride < b_stride)
1502             return false;
1503          if (a.id == 0xffffffff || b.id == 0xffffffff)
1504             return a.id ==
1505                    0xffffffff; /* place 0xffffffff before others if possible, not for any reason */
1506          return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
1507       });
1508 
1509    PhysReg next_reg = start;
1510    PhysReg space_reg;
1511    for (IDAndInfo& var : sorted) {
       /* strides are in bytes for sub-dword classes, in dwords otherwise */
1512       unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
1513       next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));
1514 
1515       /* 0xffffffff is a special variable ID used reserve a space for killed
1516        * operands and definitions.
1517        */
1518       if (var.id != 0xffffffff) {
1519          if (next_reg != ctx.assignments[var.id].reg) {
1520             RegClass rc = ctx.assignments[var.id].rc;
1521             Temp tmp(var.id, rc);
1522 
1523             Operand pc_op(tmp);
1524             pc_op.setFixed(ctx.assignments[var.id].reg);
1525             Definition pc_def(next_reg, rc);
1526             parallelcopies.emplace_back(pc_op, pc_def);
1527          }
1528       } else {
1529          space_reg = next_reg;
1530       }
1531 
1532       adjust_max_used_regs(ctx, var.info.rc, next_reg);
1533 
1534       next_reg = next_reg.advance(var.info.rc.size() * 4);
1535    }
1536 
1537    return space_reg;
1538 }
1539 
/* Checks whether the vaddr components of a MIMG instruction (operands from
 * index 3 onwards) either already sit consecutively in VGPRs, or could still
 * be placed consecutively relative to the first assigned component.
 * `first` == 512 is used as the "no component assigned yet" sentinel.
 * NOTE(review): assumes instr has at least 3 operands -- the unsigned
 * `size() - 3u` in the loop bound would wrap otherwise; confirm at callers. */
1540 bool
is_mimg_vaddr_intact(ra_ctx & ctx,RegisterFile & reg_file,Instruction * instr)1541 is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
1542 {
1543    PhysReg first{512};
1544    for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
1545       Operand op = instr->operands[i + 3];
1546 
1547       if (ctx.assignments[op.tempId()].assigned) {
1548          PhysReg reg = ctx.assignments[op.tempId()].reg;
1549 
1550          if (first.reg() == 512) {
            /* derive where the whole vector would have to start */
1551             PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
1552             first = reg.advance(i * -4);
1553             PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
1554             if (!bounds.contains(vec)) /* not enough space for other operands */
1555                return false;
1556          } else {
1557             if (reg != first.advance(i * 4)) /* not at the best position */
1558                return false;
1559          }
1560       } else {
1561          /* If there's an unexpected temporary, this operand is unlikely to be
1562           * placed in the best position.
1563           */
1564          if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
1565             return false;
1566       }
1567    }
1568 
1569    return true;
1570 }
1571 
std::optional<PhysReg>
get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
{
   /* Tries to find a register for \p temp which keeps it aligned with the
    * other operands of the p_create_vector (or MIMG vaddr) it feeds, so that
    * the vector can later be formed without extra moves. Returns no value if
    * no such register is available.
    */
   Instruction* vec = ctx.vectors[temp.id()];
   /* for MIMG, only operands[3..] (vaddr) form the vector */
   unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
   /* byte offset of temp within the vector */
   unsigned our_offset = 0;
   for (unsigned i = first_operand; i < vec->operands.size(); i++) {
      Operand& op = vec->operands[i];
      if (op.isTemp() && op.tempId() == temp.id())
         break;
      else
         our_offset += op.bytes();
   }

   if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
      unsigned their_offset = 0;
      /* check for every operand of the vector
       * - whether the operand is assigned and
       * - we can use the register relative to that operand
       */
      for (unsigned i = first_operand; i < vec->operands.size(); i++) {
         Operand& op = vec->operands[i];
         if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
             ctx.assignments[op.tempId()].assigned) {
            PhysReg reg = ctx.assignments[op.tempId()].reg;
            /* position temp relative to the already-assigned operand */
            reg.reg_b += (our_offset - their_offset);
            if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
               return reg;

            /* return if MIMG vaddr components don't remain vector-aligned */
            if (vec->format == Format::MIMG)
               return {};
         }
         their_offset += op.bytes();
      }

      /* We didn't find a register relative to other vector operands.
       * Try to find new space which fits the whole vector.
       */
      RegClass vec_rc = RegClass::get(temp.type(), their_offset);
      DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
      std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info);
      if (reg) {
         reg->reg_b += our_offset;
         /* make sure to only use byte offset if the instruction supports it */
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg))
            return reg;
      }
   }
   return {};
}
1623 
PhysReg
get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
        std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
        int operand_index = -1)
{
   /* Finds a register assignment for \p temp, trying in order:
    * split-vector/affinity/vcc/m0 hints, positions implied by vector operands,
    * free space without live-range splits, space created by live-range splits,
    * and finally either growing the register file or compacting all variables
    * at once. Copies needed to make room are appended to \p parallelcopies.
    */

   /* if temp is the result of a p_split_vector, try to place it relative to
    * the affinity of one of its sibling definitions */
   auto split_vec = ctx.split_vectors.find(temp.id());
   if (split_vec != ctx.split_vectors.end()) {
      unsigned offset = 0;
      for (Definition def : split_vec->second->definitions) {
         if (ctx.assignments[def.tempId()].affinity) {
            assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
            if (affinity.assigned) {
               PhysReg reg = affinity.reg;
               reg.reg_b -= offset;
               if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
                  return reg;
            }
         }
         offset += def.bytes();
      }
   }

   /* prefer the register of an affine variable, if already assigned */
   if (ctx.assignments[temp.id()].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
      if (affinity.assigned) {
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
            return affinity.reg;
      }
   }
   /* vcc/m0 hints: honor them only if the register is actually usable here */
   if (ctx.assignments[temp.id()].vcc) {
      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc))
         return vcc;
   }
   if (ctx.assignments[temp.id()].m0) {
      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0) && can_write_m0(instr))
         return m0;
   }

   std::optional<PhysReg> res;

   /* if temp feeds a vector, try to place it aligned with the other elements */
   if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
      res = get_reg_vector(ctx, reg_file, temp, instr);
      if (res)
         return *res;
   }

   DefInfo info(ctx, instr, temp.regClass(), operand_index);

   if (!ctx.policy.skip_optimistic_path) {
      /* try to find space without live-range splits */
      res = get_reg_simple(ctx, reg_file, info);

      if (res)
         return *res;
   }

   /* try to find space with live-range splits */
   res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);

   if (res)
      return *res;

   /* try using more registers */

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
   assert(reg_file.count_zero(info.bounds) >= info.size);

   if (!increase_register_file(ctx, info.rc.type())) {
      /* fallback algorithm: reallocate all variables at once */
      unsigned def_size = info.rc.size();
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_size += def.regClass().size();
      }

      unsigned killed_op_size = 0;
      for (Operand op : instr->operands) {
         if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_size += op.regClass().size();
      }

      const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());

      /* reallocate passthrough variables and non-killed operands */
      std::vector<IDAndRegClass> vars;
      for (unsigned id : find_vars(ctx, reg_file, regs))
         vars.emplace_back(id, ctx.assignments[id].rc);
      /* 0xffffffff reserves a hole big enough for both killed operands and
       * definitions (they don't live at the same time) */
      vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));

      PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());

      /* reallocate killed operands */
      std::vector<IDAndRegClass> killed_op_vars;
      for (Operand op : instr->operands) {
         if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_vars.emplace_back(op.tempId(), op.regClass());
      }
      compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);

      /* reallocate definitions */
      std::vector<IDAndRegClass> def_vars;
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_vars.emplace_back(def.tempId(), def.regClass());
      }
      def_vars.emplace_back(0xffffffff, info.rc);
      return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
   }

   /* the register file was enlarged: retry from the start */
   return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
}
1736 
PhysReg
get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                      aco_ptr<Instruction>& instr)
{
   /* Finds a register for the definition of a p_create_vector. Tries to place
    * the vector so that as many killed operands as possible are already at
    * their final position, minimizing shuffle copies. Falls back to the
    * generic get_reg() when too many moves would be needed.
    */
   RegClass rc = temp.regClass();
   /* create_vector instructions have different costs w.r.t. register coalescing */
   uint32_t size = rc.size();
   uint32_t bytes = rc.bytes();
   uint32_t stride = get_stride(rc);
   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());

   // TODO: improve p_create_vector for sub-dword vectors

   PhysReg best_pos{0xFFF};     /* sentinel: no candidate found yet */
   unsigned num_moves = 0xFF;   /* cost (in bytes) of the best candidate */
   bool best_avoid = true;
   uint32_t correct_pos_mask = 0; /* bit i set: operand i already in place */

   /* test for each operand which definition placement causes the least shuffle instructions */
   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      // TODO: think about, if we can alias live operands on the same register
      if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
          instr->operands[i].getTemp().type() != rc.type())
         continue;

      /* the vector start implied by this operand would be below register 0 */
      if (offset > instr->operands[i].physReg().reg_b)
         continue;

      unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
      if (reg_lower % 4)
         continue;
      PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
      unsigned k = 0; /* cost of this candidate in bytes to move */

      /* no need to check multiple times */
      if (reg_win.lo() == best_pos)
         continue;

      /* check borders: don't split a variable straddling the window edges */
      // TODO: this can be improved
      if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
         continue;
      if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* count variables to be moved and check "avoid" */
      bool avoid = false;
      bool linear_vgpr = false;
      for (PhysReg j : reg_win) {
         if (reg_file[j] != 0) {
            if (reg_file[j] == 0xF0000000) {
               /* partially used dword (sub-dword allocations):
                * count occupied bytes individually */
               PhysReg reg;
               reg.reg_b = j * 4;
               unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
               for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
                  k += reg_file.test(reg, 1);
            } else {
               k += 4;
               linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
            }
         }
         avoid |= ctx.war_hint[j];
      }

      if (linear_vgpr) {
         /* we cannot split live ranges of linear vgprs inside control flow */
         if (ctx.block->kind & block_kind_top_level)
            avoid = true;
         else
            continue;
      }

      if (avoid && !best_avoid)
         continue;

      /* count operands in wrong positions */
      uint32_t correct_pos_mask_new = 0;
      for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
           offset2 += instr->operands[j].bytes(), j++) {
         Operand& op = instr->operands[j];
         if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
            correct_pos_mask_new |= 1 << j;
         else
            k += op.bytes();
      }
      /* on equal cost, prefer a 4-dword-aligned window for v4 vectors */
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      if (k > num_moves || (!aligned && k == num_moves))
         continue;

      best_pos = reg_win.lo();
      num_moves = k;
      best_avoid = avoid;
      correct_pos_mask = correct_pos_mask_new;
   }

   /* too many moves: try the generic get_reg() function */
   if (num_moves >= 2 * bytes) {
      return get_reg(ctx, reg_file, temp, parallelcopies, instr);
   } else if (num_moves > bytes) {
      DefInfo info(ctx, instr, rc, -1);
      std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
      if (res)
         return *res;
   }

   /* re-enable killed operands which are in the wrong position */
   RegisterFile tmp_file(reg_file);
   for (Operand& op : instr->operands) {
      if (op.isTemp() && op.isFirstKillBeforeDef())
         tmp_file.fill(op);
   }
   /* operands already at their final position don't need to be moved */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
         tmp_file.clear(instr->operands[i]);
   }

   /* collect variables to be moved */
   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});

   bool success = false;
   std::vector<std::pair<Operand, Definition>> pc;
   success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});

   if (!success) {
      if (!increase_register_file(ctx, temp.type())) {
         /* use the fallback algorithm in get_reg() */
         return get_reg(ctx, reg_file, temp, parallelcopies, instr);
      }
      /* more registers available: retry */
      return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
   adjust_max_used_regs(ctx, rc, best_pos);

   return best_pos;
}
1879 
void
handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
{
   /* Determines whether lowering this pseudo instruction needs a scratch SGPR
    * (to preserve SCC across linear copies, or for sub-dword handling on
    * GFX7 and earlier) and, if so, picks a free one.
    */
   if (instr->format != Format::PSEUDO)
      return;

   /* all instructions which use handle_operands() need this information */
   switch (instr->opcode) {
   case aco_opcode::p_extract_vector:
   case aco_opcode::p_create_vector:
   case aco_opcode::p_split_vector:
   case aco_opcode::p_parallelcopy:
   case aco_opcode::p_start_linear_vgpr: break;
   default: return;
   }

   bool writes_linear = false;
   /* if all definitions are logical vgpr, no need to care for SCC */
   for (Definition& def : instr->definitions) {
      if (def.getTemp().regClass().is_linear())
         writes_linear = true;
   }
   /* if all operands are constant, no need to care either */
   bool reads_linear = false;
   bool reads_subdword = false;
   for (Operand& op : instr->operands) {
      if (op.isTemp() && op.getTemp().regClass().is_linear())
         reads_linear = true;
      if (op.isTemp() && op.regClass().is_subdword())
         reads_subdword = true;
   }
   bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
                            (ctx.program->gfx_level <= GFX7 && reads_subdword);
   if (!needs_scratch_reg)
      return;

   instr->pseudo().tmp_in_scc = reg_file[scc];

   /* search downwards from the highest used SGPR first, so that the scratch
    * register doesn't increase the maximum register demand */
   int reg = ctx.max_used_sgpr;
   for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
      ;
   if (reg < 0) {
      /* no free SGPR below max_used_sgpr: search upwards instead */
      reg = ctx.max_used_sgpr + 1;
      for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
         ;
      if (reg == ctx.program->max_reg_demand.sgpr) {
         /* last resort: use m0 (only valid for the sub-dword case) */
         assert(reads_subdword && reg_file[m0] == 0);
         reg = m0;
      }
   }

   adjust_max_used_regs(ctx, s1, reg);
   instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
}
1934 
1935 bool
operand_can_use_reg(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr,unsigned idx,PhysReg reg,RegClass rc)1936 operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
1937                     RegClass rc)
1938 {
1939    if (reg.byte()) {
1940       unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
1941       if (reg.byte() % stride)
1942          return false;
1943    }
1944 
1945    switch (instr->format) {
1946    case Format::SMEM:
1947       return reg != scc && reg != exec &&
1948              (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1949              (reg != vcc || (instr->definitions.empty() && idx == 2) ||
1950               gfx_level >= GFX10); /* sdata can be vcc */
1951    default:
1952       // TODO: there are more instructions with restrictions on registers
1953       return true;
1954    }
1955 }
1956 
void
handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
                      std::vector<std::pair<Operand, Definition>>& parallelcopy,
                      aco_ptr<Instruction>& instr)
{
   /* Creates parallelcopies which move operands fixed to a specific register
    * into that register, and relocates whatever currently occupies the target
    * registers.
    */
   assert(instr->operands.size() <= 128);

   RegisterFile tmp_file(register_file);

   /* bit i set: a copy was created for operand i */
   BITSET_DECLARE(mask, 128) = {0};

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Operand& op = instr->operands[i];

      if (!op.isTemp() || !op.isFixed())
         continue;

      PhysReg src = ctx.assignments[op.tempId()].reg;
      adjust_max_used_regs(ctx, op.regClass(), op.physReg());

      if (op.physReg() == src) {
         /* already in the right place: just keep others from being moved here */
         tmp_file.block(op.physReg(), op.regClass());
         continue;
      }

      /* the same temporary may appear fixed to the same register more than
       * once; one copy suffices */
      unsigned j;
      bool found = false;
      BITSET_FOREACH_SET (j, mask, i) {
         if (instr->operands[j].tempId() == op.tempId() &&
             instr->operands[j].physReg() == op.physReg()) {
            found = true;
            break;
         }
      }
      if (found)
         continue; /* the copy is already added to the list */

      /* clear from register_file so fixed operands are not collected by collect_vars() */
      tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src

      BITSET_SET(mask, i);

      Operand pc_op(instr->operands[i].getTemp(), src);
      Definition pc_def = Definition(op.physReg(), pc_op.regClass());
      parallelcopy.emplace_back(pc_op, pc_def);
   }

   if (BITSET_IS_EMPTY(mask))
      return;

   /* collect the variables currently occupying the copies' target registers */
   unsigned i;
   std::vector<unsigned> blocking_vars;
   BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
      Operand& op = instr->operands[i];
      PhysRegInterval target{op.physReg(), op.size()};
      std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
      blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());

      /* prevent get_regs_for_copies() from using these registers */
      tmp_file.block(op.physReg(), op.regClass());
   }

   /* move the blocking variables elsewhere and rename affected uses */
   get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
   update_renames(ctx, register_file, parallelcopy, instr,
                  rename_not_killed_ops | fill_killed_ops | rename_precolored_ops);
}
2023 
2024 void
get_reg_for_operand(ra_ctx & ctx,RegisterFile & register_file,std::vector<std::pair<Operand,Definition>> & parallelcopy,aco_ptr<Instruction> & instr,Operand & operand,unsigned operand_index)2025 get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
2026                     std::vector<std::pair<Operand, Definition>>& parallelcopy,
2027                     aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
2028 {
2029    /* clear the operand in case it's only a stride mismatch */
2030    PhysReg src = ctx.assignments[operand.tempId()].reg;
2031    register_file.clear(src, operand.regClass());
2032    PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
2033 
2034    Operand pc_op = operand;
2035    pc_op.setFixed(src);
2036    Definition pc_def = Definition(dst, pc_op.regClass());
2037    parallelcopy.emplace_back(pc_op, pc_def);
2038    update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
2039 }
2040 
PhysReg
get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
            std::vector<aco_ptr<Instruction>>& instructions, Block& block,
            aco_ptr<Instruction>& phi, Temp tmp)
{
   /* Finds a register for the phi definition \p tmp. Any variables that had to
    * be moved out of the way are handled by either updating an earlier phi's
    * destination register or, for live-ins, by creating a new phi that
    * performs the move in this block's predecessors.
    */
   std::vector<std::pair<Operand, Definition>> parallelcopy;
   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
   update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);

   /* process parallelcopy */
   for (std::pair<Operand, Definition> pc : parallelcopy) {
      /* see if it's a copy from a different phi */
      // TODO: prefer moving some previous phis over live-ins
      // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
      // problem in practice since they can only be fixed to exec)
      Instruction* prev_phi = NULL;
      std::vector<aco_ptr<Instruction>>::iterator phi_it;
      for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
         if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
            prev_phi = phi_it->get();
      }
      if (prev_phi) {
         /* if so, just update that phi's register */
         prev_phi->definitions[0].setFixed(pc.second.physReg());
         register_file.fill(prev_phi->definitions[0]);
         ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
                                                               pc.second.regClass()};
         continue;
      }

      /* rename: record the original name so later lookups resolve correctly */
      std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
      Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
      ctx.orig_names[pc.second.tempId()] = orig;
      ctx.renames[block.index][orig.id()] = pc.second.getTemp();

      /* otherwise, this is a live-in and we need to create a new phi
       * to move it in this block's predecessors */
      aco_opcode opcode =
         pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
      std::vector<unsigned>& preds =
         pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
      aco_ptr<Instruction> new_phi{
         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
      new_phi->definitions[0] = pc.second;
      for (unsigned i = 0; i < preds.size(); i++)
         new_phi->operands[i] = Operand(pc.first);
      instructions.emplace_back(std::move(new_phi));

      /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
       * a loop header.
       */
      live_in.erase(orig.id());
   }

   return reg;
}
2098 
void
get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
                  std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
{
   /* Assigns registers to all phi definitions of \p block in several passes of
    * decreasing preference: (1) phis whose operands all agree on one register,
    * (2) phis with an assigned affinity or at least one fixed operand whose
    * register is usable, (3) everything else via get_reg_phi().
    */

   /* move all phis to instructions */
   for (aco_ptr<Instruction>& phi : block.instructions) {
      if (!is_phi(phi))
         break;
      if (!phi->definitions[0].isKill())
         instructions.emplace_back(std::move(phi));
   }

   /* assign phis with all-matching registers to that register */
   for (aco_ptr<Instruction>& phi : instructions) {
      Definition& definition = phi->definitions[0];
      if (definition.isFixed())
         continue;

      if (!phi->operands[0].isTemp())
         continue;

      PhysReg reg = phi->operands[0].physReg();
      auto OpsSame = [=](const Operand& op) -> bool
      { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
      bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
      if (!all_same)
         continue;

      if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg))
         continue;

      definition.setFixed(reg);
      register_file.fill(definition);
      ctx.assignments[definition.tempId()].set(definition);
   }

   /* try to find a register that is used by at least one operand */
   for (aco_ptr<Instruction>& phi : instructions) {
      Definition& definition = phi->definitions[0];
      if (definition.isFixed())
         continue;

      /* use affinity if available */
      if (ctx.assignments[definition.tempId()].affinity &&
          ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
         assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
         assert(affinity.rc == definition.regClass());
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) {
            definition.setFixed(affinity.reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            continue;
         }
      }

      /* by going backwards, we aim to avoid copies in else-blocks */
      for (int i = phi->operands.size() - 1; i >= 0; i--) {
         const Operand& op = phi->operands[i];
         if (!op.isTemp() || !op.isFixed())
            continue;

         PhysReg reg = op.physReg();
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
            definition.setFixed(reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            break;
         }
      }
   }

   /* find registers for phis where the register was blocked or no operand was assigned */

   /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
   for (unsigned i = 0; i < instructions.size(); i++) {
      aco_ptr<Instruction>& phi = instructions[i];
      Definition& definition = phi->definitions[0];
      if (definition.isFixed())
         continue;

      definition.setFixed(
         get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));

      register_file.fill(definition);
      ctx.assignments[definition.tempId()].set(definition);
   }
}
2186 
2187 Temp
read_variable(ra_ctx & ctx,Temp val,unsigned block_idx)2188 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
2189 {
2190    std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
2191    if (it == ctx.renames[block_idx].end())
2192       return val;
2193    else
2194       return it->second;
2195 }
2196 
Temp
handle_live_in(ra_ctx& ctx, Temp val, Block* block)
{
   /* Resolves the name of live-in \p val at the start of \p block. If the
    * predecessors renamed it differently, a new phi is inserted at the top of
    * the block and its (freshly allocated) definition becomes the new name.
    */
   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
   if (preds.size() == 0)
      return val;

   if (preds.size() == 1) {
      /* if the block has only one predecessor, just look there for the name */
      return read_variable(ctx, val, preds[0]);
   }

   /* there are multiple predecessors and the block is sealed */
   Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));

   /* get the rename from each predecessor and check if they are the same */
   Temp new_val;
   bool needs_phi = false;
   for (unsigned i = 0; i < preds.size(); i++) {
      ops[i] = read_variable(ctx, val, preds[i]);
      if (i == 0)
         new_val = ops[i];
      else
         needs_phi |= !(new_val == ops[i]);
   }

   if (needs_phi) {
      assert(!val.regClass().is_linear_vgpr());

      /* the variable has been renamed differently in the predecessors: we need to insert a phi */
      aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
      aco_ptr<Instruction> phi{
         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
      new_val = ctx.program->allocateTmp(val.regClass());
      phi->definitions[0] = Definition(new_val);
      /* keep the assignments table in sync with the newly allocated temp id */
      ctx.assignments.emplace_back();
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      for (unsigned i = 0; i < preds.size(); i++) {
         /* update the operands so that it uses the new affinity */
         phi->operands[i] = Operand(ops[i]);
         assert(ctx.assignments[ops[i].id()].assigned);
         assert(ops[i].regClass() == new_val.regClass());
         phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
      }
      block->instructions.insert(block->instructions.begin(), std::move(phi));
   }

   return new_val;
}
2246 
/* Repair SSA for a finished loop: for every variable live into the loop header
 * whose name at the loop preheader differs from the name produced by
 * handle_live_in() (i.e. it was renamed by a live-range split inside the
 * loop), register the new name in the per-block rename maps, patch the
 * loop-carried phi operands, and propagate the new name through the whole
 * loop body [loop_header_idx, loop_exit_idx).
 *
 * ctx             - global register-allocation context (renames, assignments, orig_names)
 * live_in         - IDs of the temporaries live into the loop header
 * loop_header_idx - block index of the loop header
 * loop_exit_idx   - block index of the loop exit (exclusive upper bound of the loop body)
 */
void
handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
                 uint32_t loop_exit_idx)
{
   Block& loop_header = ctx.program->blocks[loop_header_idx];
   /* maps old (preheader) temp id -> new phi definition created for it */
   std::unordered_map<unsigned, Temp> renames;

   /* create phis for variables renamed during the loop */
   for (unsigned t : live_in) {
      Temp val = Temp(t, ctx.program->temp_rc[t]);
      /* name of the variable at the end of the preheader */
      Temp prev = read_variable(ctx, val, loop_header_idx - 1);
      Temp renamed = handle_live_in(ctx, val, &loop_header);
      /* no rename happened inside the loop: nothing to repair */
      if (renamed == prev)
         continue;

      /* insert additional renames at block end, but don't overwrite */
      renames[prev.id()] = renamed;
      ctx.orig_names[renamed.id()] = val;
      for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
         auto it = ctx.renames[idx].emplace(val.id(), renamed);
         /* if insertion is unsuccessful, update if necessary */
         if (!it.second && it.first->second == prev)
            it.first->second = renamed;
      }

      /* update loop-carried values of the phi created by handle_live_in() */
      for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
         Operand& op = loop_header.instructions[0]->operands[i];
         if (op.getTemp() == prev)
            op.setTemp(renamed);
      }

      /* use the assignment from the loop preheader and fix def reg */
      assignment& var = ctx.assignments[prev.id()];
      ctx.assignments[renamed.id()] = var;
      loop_header.instructions[0]->definitions[0].setFixed(var.reg);
   }

   /* rename loop carried phi operands */
   /* NOTE: starts at renames.size() — presumably handle_live_in() inserted one
    * new phi at the block start per rename, so those are skipped here. */
   for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
      aco_ptr<Instruction>& phi = loop_header.instructions[i];
      if (!is_phi(phi))
         break;
      const std::vector<unsigned>& preds =
         phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
      /* operand 0 comes from the preheader; only the back-edge operands (i >= 1)
       * need re-reading with the post-loop rename state */
      for (unsigned j = 1; j < phi->operands.size(); j++) {
         Operand& op = phi->operands[j];
         if (!op.isTemp())
            continue;

         /* Find the original name, since this operand might not use the original name if the phi
          * was created after init_reg_file().
          */
         std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId());
         Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();

         op.setTemp(read_variable(ctx, orig, preds[j]));
         op.setFixed(ctx.assignments[op.tempId()].reg);
      }
   }

   /* return early if no new phi was created */
   if (renames.empty())
      return;

   /* propagate new renames through loop */
   for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
      Block& current = ctx.program->blocks[idx];
      /* rename all uses in this block */
      for (aco_ptr<Instruction>& instr : current.instructions) {
         /* phis are renamed after RA */
         if (idx == loop_header_idx && is_phi(instr))
            continue;

         for (Operand& op : instr->operands) {
            if (!op.isTemp())
               continue;

            auto rename = renames.find(op.tempId());
            if (rename != renames.end()) {
               assert(rename->second.id());
               op.setTemp(rename->second);
            }
         }
      }
   }
}
2334 
/**
 * This function serves the purpose to correctly initialize the register file
 * at the beginning of a block (before any existing phis).
 * In order to do so, all live-in variables are entered into the RegisterFile.
 * Reg-to-reg moves (renames) from previous blocks are taken into account and
 * the SSA is repaired by inserting corresponding phi-nodes.
 *
 * As side effects this also maintains ctx.loop_header (pushed at loop headers,
 * popped and consumed by handle_loop_phis() at loop exits) and renames the
 * phi operands of this block that are already resolvable.
 */
RegisterFile
init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
{
   /* a loop just ended: repair SSA for everything renamed inside it */
   if (block.kind & block_kind_loop_exit) {
      uint32_t header = ctx.loop_header.back();
      ctx.loop_header.pop_back();
      handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
   }

   RegisterFile register_file;
   /* NOTE: live_out_per_block[block.index] holds this block's live-in set at
    * this point in the pass (see the death-point computation in get_affinities) */
   const IDSet& live_in = live_out_per_block[block.index];
   assert(block.index != 0 || live_in.empty());

   if (block.kind & block_kind_loop_header) {
      ctx.loop_header.emplace_back(block.index);
      /* already rename phis incoming value */
      /* only operand 0 (the preheader edge) is known here; back-edge operands
       * are fixed up later by handle_loop_phis() */
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!is_phi(instr))
            break;
         Operand& operand = instr->operands[0];
         if (operand.isTemp()) {
            operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
            operand.setFixed(ctx.assignments[operand.tempId()].reg);
         }
      }
      /* fill the register file with the live-ins under their preheader names */
      for (unsigned t : live_in) {
         Temp val = Temp(t, ctx.program->temp_rc[t]);
         Temp renamed = read_variable(ctx, val, block.index - 1);
         if (renamed != val)
            ctx.renames[block.index][val.id()] = renamed;
         assignment& var = ctx.assignments[renamed.id()];
         assert(var.assigned);
         register_file.fill(Definition(renamed.id(), var.reg, var.rc));
      }
   } else {
      /* rename phi operands */
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!is_phi(instr))
            break;
         /* logical phis take their operands from logical predecessors,
          * linear phis from linear predecessors */
         const std::vector<unsigned>& preds =
            instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;

         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand& operand = instr->operands[i];
            if (!operand.isTemp())
               continue;
            operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
            operand.setFixed(ctx.assignments[operand.tempId()].reg);
         }
      }
      /* enter live-ins, creating phis where predecessors disagree on the name */
      for (unsigned t : live_in) {
         Temp val = Temp(t, ctx.program->temp_rc[t]);
         Temp renamed = handle_live_in(ctx, val, &block);
         assignment& var = ctx.assignments[renamed.id()];
         /* due to live-range splits, the live-in might be a phi, now */
         if (var.assigned) {
            register_file.fill(Definition(renamed.id(), var.reg, var.rc));
         }
         if (renamed != val) {
            ctx.renames[block.index].emplace(t, renamed);
            ctx.orig_names[renamed.id()] = val;
         }
      }
   }

   return register_file;
}
2409 
/* Pre-RA analysis pass: walk the program backwards and collect allocation
 * hints in ctx before any register is assigned:
 *  - ctx.vectors / ctx.split_vectors: temps that feed p_create_vector /
 *    p_split_vector (or MIMG address operands), so they can be placed
 *    contiguously,
 *  - per-temp vcc/m0 flags for instructions whose compact encodings want
 *    those registers,
 *  - ctx.assignments[..].affinity: chains of phi definitions and their killed
 *    operands (plus coalescable parallelcopies/mac-style operands) that
 *    should end up in the same register.
 * As a side effect, live_out_per_block[b] is turned from the live-out set
 * into the live-in set of each block (operands inserted, definitions erased),
 * which init_reg_file() relies on later.
 */
void
get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
{
   /* each entry is a merge set: [0] holds the representative (latest seen
    * definition), the rest are temps that want the same register */
   std::vector<std::vector<Temp>> phi_resources;
   /* temp id -> index into phi_resources */
   std::unordered_map<unsigned, unsigned> temp_to_phi_resources;

   for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
        block_rit++) {
      Block& block = *block_rit;

      /* first, compute the death points of all live vars within the block */
      IDSet& live = live_out_per_block[block.index];

      std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
      for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         /* phis are handled in the second loop below */
         if (is_phi(instr))
            break;

         /* add vector affinities */
         if (instr->opcode == aco_opcode::p_create_vector) {
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isFirstKill() &&
                   op.getTemp().type() == instr->definitions[0].getTemp().type())
                  ctx.vectors[op.tempId()] = instr.get();
            }
         } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
                    !instr->mimg().strict_wqm) {
            /* operands from index 3 on are treated like a vector here —
             * presumably the image address components; TODO confirm */
            for (unsigned i = 3; i < instr->operands.size(); i++)
               ctx.vectors[instr->operands[i].tempId()] = instr.get();
         } else if (instr->opcode == aco_opcode::p_split_vector &&
                    instr->operands[0].isFirstKillBeforeDef()) {
            ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
         } else if (instr->isVOPC() && !instr->isVOP3()) {
            /* plain VOPC writes vcc; the SDWA form can pick another SGPR
             * except on GFX8 */
            if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
               ctx.assignments[instr->definitions[0].tempId()].vcc = true;
         } else if (instr->isVOP2() && !instr->isVOP3()) {
            /* VOP2 carry-in (operand 2) and carry-out (definition 1) use vcc */
            if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
                instr->operands[2].regClass().type() == RegType::sgpr)
               ctx.assignments[instr->operands[2].tempId()].vcc = true;
            if (instr->definitions.size() == 2)
               ctx.assignments[instr->definitions[1].tempId()].vcc = true;
         } else if (instr->opcode == aco_opcode::s_and_b32 ||
                    instr->opcode == aco_opcode::s_and_b64) {
            /* If SCC is used by a branch, we might be able to use
             * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
             */
            if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
                instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
               ctx.assignments[instr->operands[0].tempId()].vcc = true;
         } else if (instr->opcode == aco_opcode::s_sendmsg) {
            /* s_sendmsg reads its argument from m0 */
            ctx.assignments[instr->operands[0].tempId()].m0 = true;
         }

         /* add operands to live variables */
         for (const Operand& op : instr->operands) {
            if (op.isTemp())
               live.insert(op.tempId());
         }

         /* erase definitions from live */
         for (unsigned i = 0; i < instr->definitions.size(); i++) {
            const Definition& def = instr->definitions[i];
            if (!def.isTemp())
               continue;
            live.erase(def.tempId());
            /* mark last-seen phi operand */
            std::unordered_map<unsigned, unsigned>::iterator it =
               temp_to_phi_resources.find(def.tempId());
            if (it != temp_to_phi_resources.end() &&
                def.regClass() == phi_resources[it->second][0].regClass()) {
               /* this def becomes the new representative of its merge set */
               phi_resources[it->second][0] = def.getTemp();
               /* try to coalesce phi affinities with parallelcopies */
               /* NOTE: 'continue' below continues the definitions loop,
                * skipping the coalescing step for this def */
               Operand op = Operand();
               switch (instr->opcode) {
               case aco_opcode::p_parallelcopy: op = instr->operands[i]; break;

               /* these overwrite operand 2 in place, so the killed source
                * operand can share the destination register */
               case aco_opcode::v_interp_p2_f32:
               case aco_opcode::v_writelane_b32:
               case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break;

               case aco_opcode::v_fma_f32:
               case aco_opcode::v_fma_f16:
               case aco_opcode::v_pk_fma_f16:
                  /* fmac variants only exist on GFX10+ (see optimize_encoding_vop2) */
                  if (ctx.program->gfx_level < GFX10)
                     continue;
                  FALLTHROUGH;
               case aco_opcode::v_mad_f32:
               case aco_opcode::v_mad_f16:
                  /* the VOP2 mac form cannot encode modifiers */
                  if (instr->usesModifiers())
                     continue;
                  op = instr->operands[2];
                  break;

               case aco_opcode::v_mad_legacy_f32:
               case aco_opcode::v_fma_legacy_f32:
                  if (instr->usesModifiers() || !ctx.program->dev.has_mac_legacy32)
                     continue;
                  op = instr->operands[2];
                  break;

               default: continue;
               }

               if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
                  phi_resources[it->second].emplace_back(op.getTemp());
                  temp_to_phi_resources[op.tempId()] = it->second;
               }
            }
         }
      }

      /* collect phi affinities */
      for (; rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         assert(is_phi(instr));

         live.erase(instr->definitions[0].tempId());
         if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
            continue;

         assert(instr->definitions[0].isTemp());
         std::unordered_map<unsigned, unsigned>::iterator it =
            temp_to_phi_resources.find(instr->definitions[0].tempId());
         unsigned index = phi_resources.size();
         std::vector<Temp>* affinity_related;
         if (it != temp_to_phi_resources.end()) {
            /* the phi def already belongs to a merge set: extend it */
            index = it->second;
            phi_resources[index][0] = instr->definitions[0].getTemp();
            affinity_related = &phi_resources[index];
         } else {
            phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
            affinity_related = &phi_resources.back();
         }

         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
               affinity_related->emplace_back(op.getTemp());
               /* loop-header phi operands were already mapped when the
                * loop-exit block was visited (see below) */
               if (block.kind & block_kind_loop_header)
                  continue;
               temp_to_phi_resources[op.tempId()] = index;
            }
         }
      }

      /* visit the loop header phis first in order to create nested affinities */
      if (block.kind & block_kind_loop_exit) {
         /* find loop header */
         /* walking backwards: keep going while the next block is still deeper
          * in the loop nest than this exit block */
         auto header_rit = block_rit;
         while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
            header_rit++;

         for (aco_ptr<Instruction>& phi : header_rit->instructions) {
            if (!is_phi(phi))
               break;
            if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
               continue;

            /* create an (empty) merge-set for the phi-related variables */
            auto it = temp_to_phi_resources.find(phi->definitions[0].tempId());
            unsigned index = phi_resources.size();
            if (it == temp_to_phi_resources.end()) {
               temp_to_phi_resources[phi->definitions[0].tempId()] = index;
               phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
            } else {
               index = it->second;
            }
            /* operand 0 is the preheader value; only back-edge operands join
             * the set here */
            for (unsigned i = 1; i < phi->operands.size(); i++) {
               const Operand& op = phi->operands[i];
               if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
                  temp_to_phi_resources[op.tempId()] = index;
               }
            }
         }
      }
   }
   /* create affinities */
   for (std::vector<Temp>& vec : phi_resources) {
      for (unsigned i = 1; i < vec.size(); i++)
         if (vec[i].id() != vec[0].id())
            ctx.assignments[vec[i].id()].affinity = vec[0].id();
   }
}
2593 
/* Shrink a 3-operand multiply-add (VOP3/VOP3P) into its 2-operand accumulate
 * form (VOP2), e.g. v_mad_f32 -> v_mac_f32, when the accumulator operand
 * (operand 2) dies at this instruction and already sits in a register the
 * definition can reuse. Bails out (leaving the instruction untouched) if any
 * encoding restriction of the VOP2 form is violated.
 */
void
optimize_encoding_vop2(Program* program, ra_ctx& ctx, RegisterFile& register_file,
                       aco_ptr<Instruction>& instr)
{
   /* try to optimize v_mad_f32 -> v_mac_f32 */
   /* opcode must have a mac/fmac counterpart on this gfx level, operand 2 must
    * be a dying, dword-aligned VGPR (it becomes the in-place accumulator), and
    * at least one of operands 0/1 must be a VGPR (VOP2 src1 must be a VGPR) */
   if ((instr->opcode != aco_opcode::v_mad_f32 &&
        (instr->opcode != aco_opcode::v_fma_f32 || program->gfx_level < GFX10) &&
        instr->opcode != aco_opcode::v_mad_f16 && instr->opcode != aco_opcode::v_mad_legacy_f16 &&
        (instr->opcode != aco_opcode::v_fma_f16 || program->gfx_level < GFX10) &&
        (instr->opcode != aco_opcode::v_pk_fma_f16 || program->gfx_level < GFX10) &&
        (instr->opcode != aco_opcode::v_mad_legacy_f32 || !program->dev.has_mac_legacy32) &&
        (instr->opcode != aco_opcode::v_fma_legacy_f32 || !program->dev.has_mac_legacy32) &&
        (instr->opcode != aco_opcode::v_dot4_i32_i8 || program->family == CHIP_VEGA20)) ||
       !instr->operands[2].isTemp() || !instr->operands[2].isKillBeforeDef() ||
       instr->operands[2].getTemp().type() != RegType::vgpr ||
       (!instr->operands[0].isOfType(RegType::vgpr) &&
        !instr->operands[1].isOfType(RegType::vgpr)) ||
       instr->operands[2].physReg().byte() != 0 || instr->valu().opsel[2])
      return;

   /* packed form: only the default lane selection can be dropped */
   if (instr->isVOP3P() && (instr->valu().opsel_lo != 0 || instr->valu().opsel_hi != 0x7))
      return;

   /* before GFX11 the VOP2 form cannot express subdword sources or opsel */
   if ((instr->operands[0].physReg().byte() != 0 || instr->operands[1].physReg().byte() != 0 ||
        instr->valu().opsel) &&
       program->gfx_level < GFX11)
      return;

   /* DPP16 allows abs/neg on the two DPP sources; everything else must be clean */
   unsigned im_mask = instr->isDPP16() ? 0x3 : 0;
   if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) ||
       (instr->valu().neg & ~im_mask))
      return;

   /* VOP2 src1 must be a VGPR: move the VGPR operand into slot 1 if needed */
   if (!instr->operands[1].isOfType(RegType::vgpr))
      instr->valu().swapOperands(0, 1);

   /* after the swap, a non-VGPR src0 with opsel set cannot be encoded */
   if (!instr->operands[0].isOfType(RegType::vgpr) && instr->valu().opsel[0])
      return;

   /* don't lock the def into operand 2's register if that would defeat an
    * affinity whose preferred register is still free */
   unsigned def_id = instr->definitions[0].tempId();
   if (ctx.assignments[def_id].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
      if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
          !register_file.test(affinity.reg, instr->operands[2].bytes()))
         return;
   }

   /* rewrite the format to VOP2 (keeping any DPP/SDWA bits outside VOP3/VOP3P) */
   instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) |
                            (unsigned)Format::VOP2);
   instr->valu().opsel_hi = 0;
   switch (instr->opcode) {
   case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
   case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
   case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
   case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
   case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
   case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
   case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
   default: break;
   }
}
2657 
/* Shrink an SOP2 instruction with a small literal into the SOPK encoding
 * (s_add_i32 -> s_addk_i32 etc.), which carries a 16-bit signed immediate and
 * reuses the destination register as a source. The instruction is modified in
 * place; any failed precondition leaves it untouched.
 */
void
optimize_encoding_sopk(Program* program, ra_ctx& ctx, RegisterFile& register_file,
                       aco_ptr<Instruction>& instr)
{
   /* try to optimize sop2 with literal source to sopk */
   if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_mul_i32 &&
       instr->opcode != aco_opcode::s_cselect_b32)
      return;

   uint32_t literal_idx = 0;

   /* s_cmovk_i32 reads the destination as the "else" value, so for
    * s_cselect_b32 only operand 0 may be the literal */
   if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
      literal_idx = 1;

   /* the non-literal operand must die here (SOPK overwrites it as sdst) and
    * presumably must be in SGPR 0-127 because sdst is a 7-bit field — TODO
    * confirm against the ISA manual */
   if (!instr->operands[!literal_idx].isTemp() ||
       !instr->operands[!literal_idx].isKillBeforeDef() ||
       instr->operands[!literal_idx].getTemp().type() != RegType::sgpr ||
       instr->operands[!literal_idx].physReg() >= 128)
      return;

   if (!instr->operands[literal_idx].isLiteral())
      return;

   /* the literal must fit in the 16-bit signed immediate: all upper bits
    * equal to bit 15 */
   const uint32_t i16_mask = 0xffff8000u;
   uint32_t value = instr->operands[literal_idx].constantValue();
   if ((value & i16_mask) && (value & i16_mask) != i16_mask)
      return;

   /* don't force the def into the operand's register if that would defeat an
    * affinity whose preferred register is still free */
   unsigned def_id = instr->definitions[0].tempId();
   if (ctx.assignments[def_id].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
      if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
          !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
         return;
   }

   /* the SOP2 instruction is reinterpreted in place as SOPK, which is only
    * valid if SOPK is not larger than SOP2 */
   static_assert(sizeof(SOPK_instruction) <= sizeof(SOP2_instruction),
                 "Invalid direct instruction cast.");
   instr->format = Format::SOPK;
   SOPK_instruction* instr_sopk = &instr->sopk();

   /* move the literal into the imm field and drop its operand slot, keeping
    * the register operand first and scc (if present, for s_cselect) second */
   instr_sopk->imm = instr_sopk->operands[literal_idx].constantValue() & 0xffff;
   if (literal_idx == 0)
      std::swap(instr_sopk->operands[0], instr_sopk->operands[1]);
   if (instr_sopk->operands.size() > 2)
      std::swap(instr_sopk->operands[1], instr_sopk->operands[2]);
   instr_sopk->operands.pop_back();

   switch (instr_sopk->opcode) {
   case aco_opcode::s_add_i32: instr_sopk->opcode = aco_opcode::s_addk_i32; break;
   case aco_opcode::s_mul_i32: instr_sopk->opcode = aco_opcode::s_mulk_i32; break;
   case aco_opcode::s_cselect_b32: instr_sopk->opcode = aco_opcode::s_cmovk_i32; break;
   default: unreachable("illegal instruction");
   }
}
2713 
2714 void
optimize_encoding(Program * program,ra_ctx & ctx,RegisterFile & register_file,aco_ptr<Instruction> & instr)2715 optimize_encoding(Program* program, ra_ctx& ctx, RegisterFile& register_file,
2716                   aco_ptr<Instruction>& instr)
2717 {
2718    if (instr->isVALU())
2719       optimize_encoding_vop2(program, ctx, register_file, instr);
2720    if (instr->isSALU())
2721       optimize_encoding_sopk(program, ctx, register_file, instr);
2722 }
2723 
2724 } /* end namespace */
2725 
2726 void
register_allocation(Program * program,std::vector<IDSet> & live_out_per_block,ra_test_policy policy)2727 register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
2728 {
2729    ra_ctx ctx(program, policy);
2730    get_affinities(ctx, live_out_per_block);
2731 
2732    for (Block& block : program->blocks) {
2733       ctx.block = &block;
2734 
2735       /* initialize register file */
2736       RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
2737       ctx.war_hint.reset();
2738 
2739       std::vector<aco_ptr<Instruction>> instructions;
2740       instructions.reserve(block.instructions.size());
2741 
2742       /* this is a slight adjustment from the paper as we already have phi nodes:
2743        * We consider them incomplete phis and only handle the definition. */
2744       get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]);
2745 
2746       /* If this is a merge block, the state of the register file at the branch instruction of the
2747        * predecessors corresponds to the state after phis at the merge block. So, we allocate a
2748        * register for the predecessor's branch definitions as if there was a phi.
2749        */
2750       if (!block.linear_preds.empty() &&
2751           (block.linear_preds.size() != 1 ||
2752            program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) {
2753          PhysReg br_reg = get_reg_phi(ctx, live_out_per_block[block.index], register_file,
2754                                       instructions, block, ctx.phi_dummy, Temp(0, s2));
2755          for (unsigned pred : block.linear_preds) {
2756             program->blocks[pred].scc_live_out = register_file[scc];
2757             aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
2758 
2759             assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 &&
2760                    br->definitions[0].isKill());
2761 
2762             br->definitions[0].setFixed(br_reg);
2763          }
2764       }
2765 
2766       /* Handle all other instructions of the block */
2767       auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
2768       std::vector<aco_ptr<Instruction>>::iterator instr_it =
2769          std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
2770       for (; instr_it != block.instructions.end(); ++instr_it) {
2771          aco_ptr<Instruction>& instr = *instr_it;
2772 
2773          /* parallelcopies from p_phi are inserted here which means
2774           * live ranges of killed operands end here as well */
2775          if (instr->opcode == aco_opcode::p_logical_end) {
2776             /* no need to process this instruction any further */
2777             if (block.logical_succs.size() != 1) {
2778                instructions.emplace_back(std::move(instr));
2779                continue;
2780             }
2781 
2782             Block& succ = program->blocks[block.logical_succs[0]];
2783             unsigned idx = 0;
2784             for (; idx < succ.logical_preds.size(); idx++) {
2785                if (succ.logical_preds[idx] == block.index)
2786                   break;
2787             }
2788             for (aco_ptr<Instruction>& phi : succ.instructions) {
2789                if (phi->opcode == aco_opcode::p_phi) {
2790                   if (phi->operands[idx].isTemp() &&
2791                       phi->operands[idx].getTemp().type() == RegType::sgpr &&
2792                       phi->operands[idx].isFirstKillBeforeDef()) {
2793                      Definition phi_op(
2794                         read_variable(ctx, phi->operands[idx].getTemp(), block.index));
2795                      phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg);
2796                      register_file.clear(phi_op);
2797                   }
2798                } else if (phi->opcode != aco_opcode::p_linear_phi) {
2799                   break;
2800                }
2801             }
2802             instructions.emplace_back(std::move(instr));
2803             continue;
2804          }
2805 
2806          /* unconditional branches are handled after phis of the target */
2807          if (instr->opcode == aco_opcode::p_branch) {
2808             /* last instruction of the block */
2809             instructions.emplace_back(std::move(instr));
2810             break;
2811          }
2812 
2813          std::vector<std::pair<Operand, Definition>> parallelcopy;
2814 
2815          assert(!is_phi(instr));
2816 
2817          bool temp_in_scc = register_file[scc];
2818 
2819          /* handle operands */
2820          bool fixed = false;
2821          for (unsigned i = 0; i < instr->operands.size(); ++i) {
2822             auto& operand = instr->operands[i];
2823             if (!operand.isTemp())
2824                continue;
2825 
2826             /* rename operands */
2827             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
2828             assert(ctx.assignments[operand.tempId()].assigned);
2829 
2830             fixed |=
2831                operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg();
2832          }
2833 
2834          bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
2835                              instr->opcode == aco_opcode::v_writelane_b32_e64;
2836          if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
2837              instr->operands[1].isTemp()) {
2838             /* v_writelane_b32 can take two sgprs but only if one is m0. */
2839             if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
2840                 ctx.assignments[instr->operands[1].tempId()].reg != m0) {
2841                instr->operands[0].setFixed(m0);
2842                fixed = true;
2843             }
2844          }
2845 
2846          if (fixed)
2847             handle_fixed_operands(ctx, register_file, parallelcopy, instr);
2848 
2849          for (unsigned i = 0; i < instr->operands.size(); ++i) {
2850             auto& operand = instr->operands[i];
2851             if (!operand.isTemp() || operand.isFixed())
2852                continue;
2853 
2854             PhysReg reg = ctx.assignments[operand.tempId()].reg;
2855             if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
2856                operand.setFixed(reg);
2857             else
2858                get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
2859 
2860             if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
2861                 (instr->isDS() && instr->ds().gds)) {
2862                for (unsigned j = 0; j < operand.size(); j++)
2863                   ctx.war_hint.set(operand.physReg().reg() + j);
2864             }
2865          }
2866 
2867          /* remove dead vars from register file */
2868          for (const Operand& op : instr->operands) {
2869             if (op.isTemp() && op.isFirstKillBeforeDef())
2870                register_file.clear(op);
2871          }
2872 
2873          optimize_encoding(program, ctx, register_file, instr);
2874 
2875          /* Handle definitions which must have the same register as an operand.
2876           * We expect that the definition has the same size as the operand, otherwise the new
2877           * location for the operand (if it's not killed) might intersect with the old one.
2878           * We can't read from the old location because it's corrupted, and we can't write the new
2879           * location because that's used by a live-through operand.
2880           */
2881          int op_fixed_to_def = get_op_fixed_to_def(instr.get());
2882          if (op_fixed_to_def != -1)
2883             instr->definitions[0].setFixed(instr->operands[op_fixed_to_def].physReg());
2884 
2885          /* handle fixed definitions first */
2886          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2887             auto& definition = instr->definitions[i];
2888             if (!definition.isFixed())
2889                continue;
2890 
2891             adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
2892             /* check if the target register is blocked */
2893             if (register_file.test(definition.physReg(), definition.bytes())) {
2894                const PhysRegInterval def_regs{definition.physReg(), definition.size()};
2895 
2896                /* create parallelcopy pair to move blocking vars */
2897                std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);
2898 
2899                RegisterFile tmp_file(register_file);
2900                /* re-enable the killed operands, so that we don't move the blocking vars there */
2901                for (const Operand& op : instr->operands) {
2902                   if (op.isTemp() && op.isFirstKillBeforeDef())
2903                      tmp_file.fill(op);
2904                }
2905 
2906                ASSERTED bool success = false;
2907                success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
2908                assert(success);
2909 
2910                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
2911             }
2912 
2913             if (!definition.isTemp())
2914                continue;
2915 
2916             ctx.assignments[definition.tempId()].set(definition);
2917             register_file.fill(definition);
2918          }
2919 
2920          /* handle all other definitions */
2921          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2922             Definition* definition = &instr->definitions[i];
2923 
2924             if (definition->isFixed() || !definition->isTemp())
2925                continue;
2926 
2927             /* find free reg */
2928             if (instr->opcode == aco_opcode::p_split_vector) {
2929                PhysReg reg = instr->operands[0].physReg();
2930                RegClass rc = definition->regClass();
2931                for (unsigned j = 0; j < i; j++)
2932                   reg.reg_b += instr->definitions[j].bytes();
2933                if (get_reg_specified(ctx, register_file, rc, instr, reg)) {
2934                   definition->setFixed(reg);
2935                } else if (i == 0) {
2936                   RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
2937                   DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
2938                   std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info);
2939                   if (res && get_reg_specified(ctx, register_file, rc, instr, *res))
2940                      definition->setFixed(*res);
2941                } else if (instr->definitions[i - 1].isFixed()) {
2942                   reg = instr->definitions[i - 1].physReg();
2943                   reg.reg_b += instr->definitions[i - 1].bytes();
2944                   if (get_reg_specified(ctx, register_file, rc, instr, reg))
2945                      definition->setFixed(reg);
2946                }
2947             } else if (instr->opcode == aco_opcode::p_parallelcopy ||
2948                        (instr->opcode == aco_opcode::p_start_linear_vgpr &&
2949                         !instr->operands.empty())) {
2950                PhysReg reg = instr->operands[i].physReg();
2951                if (instr->operands[i].isTemp() &&
2952                    instr->operands[i].getTemp().type() == definition->getTemp().type() &&
2953                    !register_file.test(reg, definition->bytes()))
2954                   definition->setFixed(reg);
2955             } else if (instr->opcode == aco_opcode::p_extract_vector) {
2956                PhysReg reg = instr->operands[0].physReg();
2957                reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
2958                if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
2959                   definition->setFixed(reg);
2960             } else if (instr->opcode == aco_opcode::p_create_vector) {
2961                PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
2962                                                    parallelcopy, instr);
2963                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
2964                definition->setFixed(reg);
2965             } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma &&
2966                        instr->operands[2].isTemp() && instr->operands[2].isKill() &&
2967                        instr->operands[2].regClass() == definition->regClass()) {
2968                /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it.
2969                 * Here we set a policy of forcing them the same if operands[2] gets killed (and
2970                 * otherwise they don't overlap). This may not be optimal if RA would select a
2971                 * different location due to affinity, but that gets complicated very quickly. */
2972                definition->setFixed(instr->operands[2].physReg());
2973             }
2974 
2975             if (!definition->isFixed()) {
2976                Temp tmp = definition->getTemp();
2977                if (definition->regClass().is_subdword() && definition->bytes() < 4) {
2978                   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
2979                   definition->setFixed(reg);
2980                   if (reg.byte() || register_file.test(reg, 4)) {
2981                      add_subdword_definition(program, instr, reg);
2982                      definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
2983                                                              the reference */
2984                   }
2985                } else {
2986                   definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
2987                }
2988                update_renames(ctx, register_file, parallelcopy, instr,
2989                               instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
2990                                                                            : (UpdateRenames)0);
2991             }
2992 
2993             assert(
2994                definition->isFixed() &&
2995                ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
2996                 (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
2997             ctx.assignments[definition->tempId()].set(*definition);
2998             register_file.fill(*definition);
2999          }
3000 
3001          handle_pseudo(ctx, register_file, instr.get());
3002 
3003          /* kill definitions and late-kill operands and ensure that sub-dword operands can actually
3004           * be read */
3005          for (const Definition& def : instr->definitions) {
3006             if (def.isTemp() && def.isKill())
3007                register_file.clear(def);
3008          }
3009          for (unsigned i = 0; i < instr->operands.size(); i++) {
3010             const Operand& op = instr->operands[i];
3011             if (op.isTemp() && op.isFirstKill() && op.isLateKill())
3012                register_file.clear(op);
3013             if (op.isTemp() && op.physReg().byte() != 0)
3014                add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
3015          }
3016 
3017          /* emit parallelcopy */
3018          if (!parallelcopy.empty()) {
3019             aco_ptr<Pseudo_instruction> pc;
3020             pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
3021                                                             Format::PSEUDO, parallelcopy.size(),
3022                                                             parallelcopy.size()));
3023             bool linear_vgpr = false;
3024             bool sgpr_operands_alias_defs = false;
3025             uint64_t sgpr_operands[4] = {0, 0, 0, 0};
3026             for (unsigned i = 0; i < parallelcopy.size(); i++) {
3027                linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
3028 
3029                if (temp_in_scc && parallelcopy[i].first.isTemp() &&
3030                    parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
3031                   if (!sgpr_operands_alias_defs) {
3032                      unsigned reg = parallelcopy[i].first.physReg().reg();
3033                      unsigned size = parallelcopy[i].first.getTemp().size();
3034                      sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
3035 
3036                      reg = parallelcopy[i].second.physReg().reg();
3037                      size = parallelcopy[i].second.getTemp().size();
3038                      if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
3039                         sgpr_operands_alias_defs = true;
3040                   }
3041                }
3042 
3043                pc->operands[i] = parallelcopy[i].first;
3044                pc->definitions[i] = parallelcopy[i].second;
3045                assert(pc->operands[i].size() == pc->definitions[i].size());
3046 
3047                /* it might happen that the operand is already renamed. we have to restore the
3048                 * original name. */
3049                std::unordered_map<unsigned, Temp>::iterator it =
3050                   ctx.orig_names.find(pc->operands[i].tempId());
3051                Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
3052                ctx.orig_names[pc->definitions[i].tempId()] = orig;
3053                ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
3054             }
3055 
3056             if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
3057                /* disable definitions and re-enable operands */
3058                RegisterFile tmp_file(register_file);
3059                for (const Definition& def : instr->definitions) {
3060                   if (def.isTemp() && !def.isKill())
3061                      tmp_file.clear(def);
3062                }
3063                for (const Operand& op : instr->operands) {
3064                   if (op.isTemp() && op.isFirstKill())
3065                      tmp_file.block(op.physReg(), op.regClass());
3066                }
3067 
3068                handle_pseudo(ctx, tmp_file, pc.get());
3069             } else {
3070                pc->tmp_in_scc = false;
3071             }
3072 
3073             instructions.emplace_back(std::move(pc));
3074          }
3075 
3076          /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
3077          bool instr_needs_vop3 =
3078             !instr->isVOP3() &&
3079             ((withoutDPP(instr->format) == Format::VOPC &&
3080               instr->definitions[0].physReg() != vcc) ||
3081              (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) ||
3082              ((instr->opcode == aco_opcode::v_add_co_u32 ||
3083                instr->opcode == aco_opcode::v_addc_co_u32 ||
3084                instr->opcode == aco_opcode::v_sub_co_u32 ||
3085                instr->opcode == aco_opcode::v_subb_co_u32 ||
3086                instr->opcode == aco_opcode::v_subrev_co_u32 ||
3087                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
3088               instr->definitions[1].physReg() != vcc) ||
3089              ((instr->opcode == aco_opcode::v_addc_co_u32 ||
3090                instr->opcode == aco_opcode::v_subb_co_u32 ||
3091                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
3092               instr->operands[2].physReg() != vcc));
3093          if (instr_needs_vop3) {
3094 
3095             /* if the first operand is a literal, we have to move it to a reg */
3096             if (instr->operands.size() && instr->operands[0].isLiteral() &&
3097                 program->gfx_level < GFX10) {
3098                bool can_sgpr = true;
3099                /* check, if we have to move to vgpr */
3100                for (const Operand& op : instr->operands) {
3101                   if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
3102                      can_sgpr = false;
3103                      break;
3104                   }
3105                }
3106                /* disable definitions and re-enable operands */
3107                RegisterFile tmp_file(register_file);
3108                for (const Definition& def : instr->definitions)
3109                   tmp_file.clear(def);
3110                for (const Operand& op : instr->operands) {
3111                   if (op.isTemp() && op.isFirstKill())
3112                      tmp_file.block(op.physReg(), op.regClass());
3113                }
3114                Temp tmp = program->allocateTmp(can_sgpr ? s1 : v1);
3115                ctx.assignments.emplace_back();
3116                PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr);
3117                update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
3118 
3119                aco_ptr<Instruction> mov;
3120                if (can_sgpr)
3121                   mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32,
3122                                                                  Format::SOP1, 1, 1));
3123                else
3124                   mov.reset(create_instruction<VALU_instruction>(aco_opcode::v_mov_b32,
3125                                                                  Format::VOP1, 1, 1));
3126                mov->operands[0] = instr->operands[0];
3127                mov->definitions[0] = Definition(tmp);
3128                mov->definitions[0].setFixed(reg);
3129 
3130                instr->operands[0] = Operand(tmp);
3131                instr->operands[0].setFixed(reg);
3132                instr->operands[0].setFirstKill(true);
3133 
3134                instructions.emplace_back(std::move(mov));
3135             }
3136 
3137             /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
3138             instr->format = asVOP3(instr->format);
3139          }
3140 
3141          instructions.emplace_back(std::move(*instr_it));
3142 
3143       } /* end for Instr */
3144 
3145       block.instructions = std::move(instructions);
3146    } /* end for BB */
3147 
3148    /* num_gpr = rnd_up(max_used_gpr + 1) */
3149    program->config->num_vgprs =
3150       std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
3151    program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
3152 
3153    program->progress = CompilationProgress::after_ra;
3154 }
3155 
3156 } // namespace aco
3157