• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_ir.h"
26 
27 #include <algorithm>
28 #include <array>
29 #include <bitset>
30 #include <map>
31 #include <set>
32 #include <unordered_map>
33 #include <vector>
34 
35 namespace aco {
36 namespace {
37 
38 struct ra_ctx;
39 
40 unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr,
41                                      unsigned idx, RegClass rc);
42 void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
43                           RegClass rc);
44 std::pair<unsigned, unsigned>
45 get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc);
46 void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg);
47 
48 struct assignment {
49    PhysReg reg;
50    RegClass rc;
51    bool assigned = false;
52    uint32_t affinity = 0;
53    assignment() = default;
assignmentaco::__anon99bf8b5d0111::assignment54    assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {}
setaco::__anon99bf8b5d0111::assignment55    void set(const Definition& def)
56    {
57       assigned = true;
58       reg = def.physReg();
59       rc = def.regClass();
60    }
61 };
62 
63 struct ra_ctx {
64 
65    Program* program;
66    Block* block = NULL;
67    std::vector<assignment> assignments;
68    std::vector<std::unordered_map<unsigned, Temp>> renames;
69    std::vector<uint32_t> loop_header;
70    std::unordered_map<unsigned, Temp> orig_names;
71    std::unordered_map<unsigned, Instruction*> vectors;
72    std::unordered_map<unsigned, Instruction*> split_vectors;
73    aco_ptr<Instruction> pseudo_dummy;
74    uint16_t max_used_sgpr = 0;
75    uint16_t max_used_vgpr = 0;
76    uint16_t sgpr_limit;
77    uint16_t vgpr_limit;
78    std::bitset<512> war_hint;
79    std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
80 
81    ra_test_policy policy;
82 
ra_ctxaco::__anon99bf8b5d0111::ra_ctx83    ra_ctx(Program* program_, ra_test_policy policy_)
84        : program(program_), assignments(program->peekAllocationId()),
85          renames(program->blocks.size()), policy(policy_)
86    {
87       pseudo_dummy.reset(
88          create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
89       sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
90       vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
91    }
92 };
93 
/* Iterator type for making PhysRegInterval compatible with range-based for */
struct PhysRegIterator {
   using difference_type = int;
   using value_type = unsigned;
   using reference = const unsigned&;
   using pointer = const unsigned*;
   using iterator_category = std::bidirectional_iterator_tag;

   PhysReg reg;

   /* Dereferencing yields the current register by value. */
   PhysReg operator*() const { return reg; }

   /* Advance by one full dword register (reg_b counts bytes, hence += 4). */
   PhysRegIterator& operator++()
   {
      reg.reg_b += 4;
      return *this;
   }

   PhysRegIterator& operator--()
   {
      reg.reg_b -= 4;
      return *this;
   }

   bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }

   bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }

   bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
};
124 
/* Half-open register interval used in "sliding window"-style for-loops */
struct PhysRegInterval {
   PhysReg lo_;   /* first register of the interval */
   unsigned size; /* number of dword registers covered */

   /* Inclusive lower bound */
   PhysReg lo() const { return lo_; }

   /* Exclusive upper bound */
   PhysReg hi() const { return PhysReg{lo() + size}; }

   /* Slide the window up by 'stride' registers; the size stays fixed. */
   PhysRegInterval& operator+=(uint32_t stride)
   {
      lo_ = PhysReg{lo_.reg() + stride};
      return *this;
   }

   bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }

   /* Construct a half-open interval, excluding the end register */
   static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }

   /* True if reg lies within [lo, hi). */
   bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }

   /* True if 'needle' lies entirely within this interval. */
   bool contains(const PhysRegInterval& needle) const
   {
      return needle.lo() >= lo() && needle.hi() <= hi();
   }

   PhysRegIterator begin() const { return {lo_}; }

   PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
};
158 
159 bool
intersects(const PhysRegInterval & a,const PhysRegInterval & b)160 intersects(const PhysRegInterval& a, const PhysRegInterval& b)
161 {
162    return a.hi() > b.lo() && b.hi() > a.lo();
163 }
164 
165 /* Gets the stride for full (non-subdword) registers */
166 uint32_t
get_stride(RegClass rc)167 get_stride(RegClass rc)
168 {
169    if (rc.type() == RegType::vgpr) {
170       return 1;
171    } else {
172       uint32_t size = rc.size();
173       if (size == 2) {
174          return 2;
175       } else if (size >= 4) {
176          return 4;
177       } else {
178          return 1;
179       }
180    }
181 }
182 
183 PhysRegInterval
get_reg_bounds(Program * program,RegType type)184 get_reg_bounds(Program* program, RegType type)
185 {
186    if (type == RegType::vgpr) {
187       return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};
188    } else {
189       return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};
190    }
191 }
192 
/* Placement constraints for allocating one variable: the register window it
 * must live in, its size, and its alignment stride. 'operand' is the operand
 * index, or negative when describing a definition. */
struct DefInfo {
   PhysRegInterval bounds; /* allocatable register window */
   uint8_t size;           /* size in dwords */
   uint8_t stride;         /* alignment: dwords, or bytes for subdword classes */
   RegClass rc;

   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);

      bounds = get_reg_bounds(ctx.program, rc.type());

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc);
      } else if (rc.is_subdword()) {
         /* The instruction may write more bytes than rc holds; widen rc (and
          * size/stride) so the extra bytes get reserved too. */
         std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
         stride = info.first;
         if (info.second > rc.bytes()) {
            rc = RegClass::get(rc.type(), info.second);
            size = rc.size();
            /* we might still be able to put the definition in the high half,
             * but that's only useful for affinities and this information isn't
             * used for them */
            stride = align(stride, info.second);
            if (!rc.is_subdword())
               stride = DIV_ROUND_UP(stride, 4);
         }
         assert(stride > 0);
      }
   }
};
226 
/* Tracks which physical registers (and individual bytes) currently hold a
 * temporary. Each dword entry in 'regs' is one of:
 *   0          - free
 *   0xFFFFFFFF - blocked (unusable for allocation)
 *   0xF0000000 - allocated at sub-dword granularity; per-byte temp ids are
 *                stored in subdword_regs[reg]
 *   otherwise  - the id of the temp occupying the whole dword
 */
class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

   /* 512 dword slots: SGPRs at [0,256), VGPRs at [256,512). */
   std::array<uint32_t, 512> regs;
   /* Per-byte temp ids for registers whose 'regs' entry is 0xF0000000. */
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   /* Number of completely free dword registers in the interval. */
   unsigned count_zero(PhysRegInterval reg_interval)
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes)
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         /* any temp id (or the blocked marker's low bits) means occupied;
          * the subdword sentinel 0xF0000000 has no low bits set */
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            assert(subdword_regs.find(i) != subdword_regs.end());
            /* only check the bytes actually inside [start, start+num_bytes) */
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   /* Mark the registers covered by rc at start as unusable for allocation. */
   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   /* True if the dword at start is blocked, wholly or in any byte. */
   bool is_blocked(PhysReg start)
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start)
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         return subdword_regs[start][start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   /* Free the registers covered by rc at start. */
   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   /* Record that the operand's temp occupies its assigned registers. */
   void fill(Operand op)
   {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   /* Record that the definition's temp occupies its assigned registers. */
   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   /* Temp id stored at reg, resolving sub-dword entries via reg's byte offset. */
   unsigned get_id(PhysReg reg)
   {
      return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      /* mark every touched dword with the subdword sentinel */
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         /* if clearing (val == 0) freed all four bytes, drop the entry and
          * mark the dword fully free again */
         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
351 
352 std::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file,
353                                                   const PhysRegInterval reg_interval);
354 
355 /* helper function for debugging */
356 UNUSED void
print_reg(const RegisterFile & reg_file,PhysReg reg,bool has_adjacent_variable)357 print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
358 {
359    if (reg_file[reg] == 0xFFFFFFFF) {
360       printf("☐");
361    } else if (reg_file[reg]) {
362       const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
363       if (show_subdword_alloc) {
364          const char* block_chars[] = {
365             // clang-format off
366             "?", "▘", "▝", "▀",
367             "▖", "▌", "▞", "▛",
368             "▗", "▚", "▐", "▜",
369             "▄", "▙", "▟", "▉"
370             // clang-format on
371          };
372          unsigned index = 0;
373          for (int i = 0; i < 4; ++i) {
374             if (reg_file.subdword_regs.at(reg)[i]) {
375                index |= 1 << i;
376             }
377          }
378          printf("%s", block_chars[index]);
379       } else {
380          /* Indicate filled register slot */
381          if (!has_adjacent_variable) {
382             printf("█");
383          } else {
384             /* Use a slightly shorter box to leave a small gap between adjacent variables */
385             printf("▉");
386          }
387       }
388    } else {
389       printf("·");
390    }
391 }
392 
/* helper function for debugging: dumps the register file as a glyph grid
 * followed by a per-variable assignment listing */
UNUSED void
print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);
   char reg_char = vgprs ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf("       ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         /* continuation lines are labelled with their register offset */
         printf("  %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         /* draw a gap glyph where the next register holds a different variable */
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>>
      regs_to_vars; /* maps to byte size and temp id */
   for (const auto& size_id : find_vars(ctx, reg_file, regs)) {
      auto reg = ctx.assignments[size_id.second].reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id);
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      /* show the pre-renaming name if this temp was renamed during RA */
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() - regs.lo());
      }
      printf("]");
      /* sub-dword variables additionally get a bit-range suffix */
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}
467 
468 unsigned
get_subdword_operand_stride(chip_class chip,const aco_ptr<Instruction> & instr,unsigned idx,RegClass rc)469 get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx,
470                             RegClass rc)
471 {
472    if (instr->isPseudo()) {
473       /* v_readfirstlane_b32 cannot use SDWA */
474       if (instr->opcode == aco_opcode::p_as_uniform)
475          return 4;
476       else if (chip >= GFX8)
477          return rc.bytes() % 2 == 0 ? 2 : 1;
478       else
479          return 4;
480    }
481 
482    assert(rc.bytes() <= 2);
483    if (instr->isVALU()) {
484       if (can_use_SDWA(chip, instr, false))
485          return rc.bytes();
486       if (can_use_opsel(chip, instr->opcode, idx, true))
487          return 2;
488       if (instr->format == Format::VOP3P)
489          return 2;
490    }
491 
492    switch (instr->opcode) {
493    case aco_opcode::v_cvt_f32_ubyte0: return 1;
494    case aco_opcode::ds_write_b8:
495    case aco_opcode::ds_write_b16: return chip >= GFX9 ? 2 : 4;
496    case aco_opcode::buffer_store_byte:
497    case aco_opcode::buffer_store_short:
498    case aco_opcode::flat_store_byte:
499    case aco_opcode::flat_store_short:
500    case aco_opcode::scratch_store_byte:
501    case aco_opcode::scratch_store_short:
502    case aco_opcode::global_store_byte:
503    case aco_opcode::global_store_short: return chip >= GFX9 ? 2 : 4;
504    default: return 4;
505    }
506 }
507 
508 void
add_subdword_operand(ra_ctx & ctx,aco_ptr<Instruction> & instr,unsigned idx,unsigned byte,RegClass rc)509 add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
510                      RegClass rc)
511 {
512    chip_class chip = ctx.program->chip_class;
513    if (instr->isPseudo() || byte == 0)
514       return;
515 
516    assert(rc.bytes() <= 2);
517    if (instr->isVALU()) {
518       /* check if we can use opsel */
519       if (instr->format == Format::VOP3) {
520          assert(byte == 2);
521          instr->vop3().opsel |= 1 << idx;
522          return;
523       }
524       if (instr->isVOP3P()) {
525          assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx)));
526          instr->vop3p().opsel_lo |= 1 << idx;
527          instr->vop3p().opsel_hi |= 1 << idx;
528          return;
529       }
530       if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
531          switch (byte) {
532          case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
533          case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
534          case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
535          case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
536          }
537          return;
538       }
539 
540       /* use SDWA */
541       assert(can_use_SDWA(chip, instr, false));
542       convert_to_SDWA(chip, instr);
543       return;
544    }
545 
546    assert(byte == 2);
547    if (instr->opcode == aco_opcode::ds_write_b8)
548       instr->opcode = aco_opcode::ds_write_b8_d16_hi;
549    else if (instr->opcode == aco_opcode::ds_write_b16)
550       instr->opcode = aco_opcode::ds_write_b16_d16_hi;
551    else if (instr->opcode == aco_opcode::buffer_store_byte)
552       instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
553    else if (instr->opcode == aco_opcode::buffer_store_short)
554       instr->opcode = aco_opcode::buffer_store_short_d16_hi;
555    else if (instr->opcode == aco_opcode::flat_store_byte)
556       instr->opcode = aco_opcode::flat_store_byte_d16_hi;
557    else if (instr->opcode == aco_opcode::flat_store_short)
558       instr->opcode = aco_opcode::flat_store_short_d16_hi;
559    else if (instr->opcode == aco_opcode::scratch_store_byte)
560       instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
561    else if (instr->opcode == aco_opcode::scratch_store_short)
562       instr->opcode = aco_opcode::scratch_store_short_d16_hi;
563    else if (instr->opcode == aco_opcode::global_store_byte)
564       instr->opcode = aco_opcode::global_store_byte_d16_hi;
565    else if (instr->opcode == aco_opcode::global_store_short)
566       instr->opcode = aco_opcode::global_store_short_d16_hi;
567    else
568       unreachable("Something went wrong: Impossible register assignment.");
569    return;
570 }
571 
572 /* minimum_stride, bytes_written */
573 std::pair<unsigned, unsigned>
get_subdword_definition_info(Program * program,const aco_ptr<Instruction> & instr,RegClass rc)574 get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)
575 {
576    chip_class chip = program->chip_class;
577 
578    if (instr->isPseudo()) {
579       if (chip >= GFX8)
580          return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
581       else
582          return std::make_pair(4, rc.size() * 4u);
583    }
584 
585    if (instr->isVALU() || instr->isVINTRP()) {
586       assert(rc.bytes() <= 2);
587 
588       if (can_use_SDWA(chip, instr, false))
589          return std::make_pair(rc.bytes(), rc.bytes());
590 
591       unsigned bytes_written = 4u;
592       if (instr_is_16bit(chip, instr->opcode))
593          bytes_written = 2u;
594 
595       unsigned stride = 4u;
596       if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
597           can_use_opsel(chip, instr->opcode, -1, true))
598          stride = 2u;
599 
600       return std::make_pair(stride, bytes_written);
601    }
602 
603    switch (instr->opcode) {
604    case aco_opcode::ds_read_u8_d16:
605    case aco_opcode::ds_read_i8_d16:
606    case aco_opcode::ds_read_u16_d16:
607    case aco_opcode::flat_load_ubyte_d16:
608    case aco_opcode::flat_load_sbyte_d16:
609    case aco_opcode::flat_load_short_d16:
610    case aco_opcode::global_load_ubyte_d16:
611    case aco_opcode::global_load_sbyte_d16:
612    case aco_opcode::global_load_short_d16:
613    case aco_opcode::scratch_load_ubyte_d16:
614    case aco_opcode::scratch_load_sbyte_d16:
615    case aco_opcode::scratch_load_short_d16:
616    case aco_opcode::buffer_load_ubyte_d16:
617    case aco_opcode::buffer_load_sbyte_d16:
618    case aco_opcode::buffer_load_short_d16: {
619       assert(chip >= GFX9);
620       if (!program->dev.sram_ecc_enabled)
621          return std::make_pair(2u, 2u);
622       else
623          return std::make_pair(2u, 4u);
624    }
625 
626    default: return std::make_pair(4, rc.size() * 4u);
627    }
628 }
629 
630 void
add_subdword_definition(Program * program,aco_ptr<Instruction> & instr,PhysReg reg)631 add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)
632 {
633    if (instr->isPseudo())
634       return;
635 
636    if (instr->isVALU()) {
637       chip_class chip = program->chip_class;
638       assert(instr->definitions[0].bytes() <= 2);
639 
640       if (reg.byte() == 0 && instr_is_16bit(chip, instr->opcode))
641          return;
642 
643       /* check if we can use opsel */
644       if (instr->format == Format::VOP3) {
645          assert(reg.byte() == 2);
646          assert(can_use_opsel(chip, instr->opcode, -1, true));
647          instr->vop3().opsel |= (1 << 3); /* dst in high half */
648          return;
649       }
650 
651       if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
652          instr->opcode = aco_opcode::v_fma_mixhi_f16;
653          return;
654       }
655 
656       /* use SDWA */
657       assert(can_use_SDWA(chip, instr, false));
658       convert_to_SDWA(chip, instr);
659       return;
660    }
661 
662    if (reg.byte() == 0)
663       return;
664    else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
665       instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
666    else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
667       instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
668    else if (instr->opcode == aco_opcode::buffer_load_short_d16)
669       instr->opcode = aco_opcode::buffer_load_short_d16_hi;
670    else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
671       instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
672    else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
673       instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
674    else if (instr->opcode == aco_opcode::flat_load_short_d16)
675       instr->opcode = aco_opcode::flat_load_short_d16_hi;
676    else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
677       instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
678    else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
679       instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
680    else if (instr->opcode == aco_opcode::scratch_load_short_d16)
681       instr->opcode = aco_opcode::scratch_load_short_d16_hi;
682    else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
683       instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
684    else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
685       instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
686    else if (instr->opcode == aco_opcode::global_load_short_d16)
687       instr->opcode = aco_opcode::global_load_short_d16_hi;
688    else if (instr->opcode == aco_opcode::ds_read_u8_d16)
689       instr->opcode = aco_opcode::ds_read_u8_d16_hi;
690    else if (instr->opcode == aco_opcode::ds_read_i8_d16)
691       instr->opcode = aco_opcode::ds_read_i8_d16_hi;
692    else if (instr->opcode == aco_opcode::ds_read_u16_d16)
693       instr->opcode = aco_opcode::ds_read_u16_d16_hi;
694    else
695       unreachable("Something went wrong: Impossible register assignment.");
696 }
697 
698 void
adjust_max_used_regs(ra_ctx & ctx,RegClass rc,unsigned reg)699 adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
700 {
701    uint16_t max_addressible_sgpr = ctx.sgpr_limit;
702    unsigned size = rc.size();
703    if (rc.type() == RegType::vgpr) {
704       assert(reg >= 256);
705       uint16_t hi = reg - 256 + size - 1;
706       ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
707    } else if (reg + rc.size() <= max_addressible_sgpr) {
708       uint16_t hi = reg + size - 1;
709       ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
710    }
711 }
712 
/* Flags controlling update_renames() behavior. */
enum UpdateRenames {
   /* rename even operands that are not killed (otherwise killed-ness can
    * allow skipping the rename) */
   rename_not_killed_ops = 0x1,
   /* re-fill the register file for copy destinations even when the operand
    * is killed before the definition */
   fill_killed_ops = 0x2,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);
718 
/* After parallelcopies have been decided, allocate temp ids for the copy
 * destinations, update the register file, and rewrite the instruction's
 * operands/definitions to reference the moved registers.
 *
 * Copies whose destination already has a temp id were created elsewhere and
 * are left alone; only id-less copies are processed here. */
void
update_renames(ra_ctx& ctx, RegisterFile& reg_file,
               std::vector<std::pair<Operand, Definition>>& parallelcopies,
               aco_ptr<Instruction>& instr, UpdateRenames flags)
{
   /* clear operands */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;
      reg_file.clear(copy.first);
   }

   /* allocate id's and rename operands: this is done transparently here */
   auto it = parallelcopies.begin();
   while (it != parallelcopies.end()) {
      if (it->second.isTemp()) {
         ++it;
         continue;
      }

      /* check if we moved a definition: change the register and remove copy */
      bool is_def = false;
      for (Definition& def : instr->definitions) {
         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
            // FIXME: ensure that the definition can use this reg
            def.setFixed(it->second.physReg());
            reg_file.fill(def);
            ctx.assignments[def.tempId()].reg = def.physReg();
            /* erase() returns the next element, so the loop continues without ++it */
            it = parallelcopies.erase(it);
            is_def = true;
            break;
         }
      }
      if (is_def)
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (it->first.getTemp() == other.second.getTemp()) {
            other.second.setFixed(it->second.physReg());
            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            /* check if we moved an operand, again */
            bool fill = true;
            for (Operand& op : instr->operands) {
               if (op.isTemp() && op.tempId() == other.second.tempId()) {
                  // FIXME: ensure that the operand can use this reg
                  op.setFixed(other.second.physReg());
                  fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
               }
            }
            if (fill)
               reg_file.fill(other.second);
            break;
         }
      }
      if (is_def)
         continue;

      /* an ordinary copy: give its destination a fresh temp id */
      std::pair<Operand, Definition>& copy = *it;
      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());

      /* check if we moved an operand */
      bool first = true;
      bool fill = true;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* killed operands may keep their old name if no copy destination
             * overlaps the operand's registers */
            bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               omit_renaming &= def_reg > copy.first.physReg()
                                   ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
                                   : (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming) {
               if (first)
                  op.setFirstKill(true);
               else
                  op.setKill(true);
               first = false;
               continue;
            }
            /* rename the operand to the copy's destination */
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());

            fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
         }
      }

      if (fill)
         reg_file.fill(copy.second);

      ++it;
   }
}
823 
824 std::pair<PhysReg, bool>
get_reg_simple(ra_ctx & ctx,RegisterFile & reg_file,DefInfo info)825 get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
826 {
827    const PhysRegInterval& bounds = info.bounds;
828    uint32_t size = info.size;
829    uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
830    RegClass rc = info.rc;
831 
832    DefInfo new_info = info;
833    new_info.rc = RegClass(rc.type(), size);
834    for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) {
835       if (size % new_stride)
836          continue;
837       new_info.stride = new_stride;
838       std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info);
839       if (res.second)
840          return res;
841    }
842 
843    auto is_free = [&](PhysReg reg_index)
844    { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };
845 
846    if (stride == 1) {
847       /* best fit algorithm: find the smallest gap to fit in the variable */
848       PhysRegInterval best_gap{PhysReg{0}, UINT_MAX};
849       const unsigned max_gpr =
850          (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr;
851 
852       PhysRegIterator reg_it = bounds.begin();
853       const PhysRegIterator end_it =
854          std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it));
855       while (reg_it != bounds.end()) {
856          /* Find the next chunk of available register slots */
857          reg_it = std::find_if(reg_it, end_it, is_free);
858          auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free);
859          if (reg_it == bounds.end()) {
860             break;
861          }
862 
863          if (next_nonfree_it == end_it) {
864             /* All registers past max_used_gpr are free */
865             next_nonfree_it = bounds.end();
866          }
867 
868          PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it);
869 
870          /* early return on exact matches */
871          if (size == gap.size) {
872             adjust_max_used_regs(ctx, rc, gap.lo());
873             return {gap.lo(), true};
874          }
875 
876          /* check if it fits and the gap size is smaller */
877          if (size < gap.size && gap.size < best_gap.size) {
878             best_gap = gap;
879          }
880 
881          /* Move past the processed chunk */
882          reg_it = next_nonfree_it;
883       }
884 
885       if (best_gap.size == UINT_MAX)
886          return {{}, false};
887 
888       /* find best position within gap by leaving a good stride for other variables*/
889       unsigned buffer = best_gap.size - size;
890       if (buffer > 1) {
891          if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) ||
892              ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) ||
893              ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0))
894             best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer};
895       }
896 
897       adjust_max_used_regs(ctx, rc, best_gap.lo());
898       return {best_gap.lo(), true};
899    }
900 
901    for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
902         reg_win += stride) {
903       if (reg_file[reg_win.lo()] != 0) {
904          continue;
905       }
906 
907       bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);
908       if (is_valid) {
909          adjust_max_used_regs(ctx, rc, reg_win.lo());
910          return {reg_win.lo(), true};
911       }
912    }
913 
914    /* do this late because using the upper bytes of a register can require
915     * larger instruction encodings or copies
916     * TODO: don't do this in situations where it doesn't benefit */
917    if (rc.is_subdword()) {
918       for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {
919          assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
920          if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
921             continue;
922 
923          for (unsigned i = 0; i < 4; i += info.stride) {
924             /* check if there's a block of free bytes large enough to hold the register */
925             bool reg_found =
926                std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())],
927                            [](unsigned v) { return v == 0; });
928 
929             /* check if also the neighboring reg is free if needed */
930             if (reg_found && i + rc.bytes() > 4)
931                reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);
932 
933             if (reg_found) {
934                PhysReg res{entry.first};
935                res.reg_b += i;
936                adjust_max_used_regs(ctx, rc, entry.first);
937                return {res, true};
938             }
939          }
940       }
941    }
942 
943    return {{}, false};
944 }
945 
946 /* collect variables from a register area and clear reg_file */
947 std::set<std::pair<unsigned, unsigned>>
find_vars(ra_ctx & ctx,RegisterFile & reg_file,const PhysRegInterval reg_interval)948 find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
949 {
950    std::set<std::pair<unsigned, unsigned>> vars;
951    for (PhysReg j : reg_interval) {
952       if (reg_file.is_blocked(j))
953          continue;
954       if (reg_file[j] == 0xF0000000) {
955          for (unsigned k = 0; k < 4; k++) {
956             unsigned id = reg_file.subdword_regs[j][k];
957             if (id) {
958                assignment& var = ctx.assignments[id];
959                vars.emplace(var.rc.bytes(), id);
960             }
961          }
962       } else if (reg_file[j] != 0) {
963          unsigned id = reg_file[j];
964          assignment& var = ctx.assignments[id];
965          vars.emplace(var.rc.bytes(), id);
966       }
967    }
968    return vars;
969 }
970 
971 /* collect variables from a register area and clear reg_file */
972 std::set<std::pair<unsigned, unsigned>>
collect_vars(ra_ctx & ctx,RegisterFile & reg_file,const PhysRegInterval reg_interval)973 collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
974 {
975    std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval);
976    for (std::pair<unsigned, unsigned> size_id : vars) {
977       assignment& var = ctx.assignments[size_id.second];
978       reg_file.clear(var.reg, var.rc);
979    }
980    return vars;
981 }
982 
983 bool
get_regs_for_copies(ra_ctx & ctx,RegisterFile & reg_file,std::vector<std::pair<Operand,Definition>> & parallelcopies,const std::set<std::pair<unsigned,unsigned>> & vars,const PhysRegInterval bounds,aco_ptr<Instruction> & instr,const PhysRegInterval def_reg)984 get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
985                     std::vector<std::pair<Operand, Definition>>& parallelcopies,
986                     const std::set<std::pair<unsigned, unsigned>>& vars,
987                     const PhysRegInterval bounds, aco_ptr<Instruction>& instr,
988                     const PhysRegInterval def_reg)
989 {
990    /* variables are sorted from small sized to large */
991    /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders
992     * slightly though. */
993    for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin();
994         it != vars.rend(); ++it) {
995       unsigned id = it->second;
996       assignment& var = ctx.assignments[id];
997       DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
998       uint32_t size = info.size;
999 
1000       /* check if this is a dead operand, then we can re-use the space from the definition
1001        * also use the correct stride for sub-dword operands */
1002       bool is_dead_operand = false;
1003       for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
1004          if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
1005             if (instr->operands[i].isKillBeforeDef())
1006                is_dead_operand = true;
1007             info = DefInfo(ctx, instr, var.rc, i);
1008             break;
1009          }
1010       }
1011 
1012       std::pair<PhysReg, bool> res;
1013       if (is_dead_operand) {
1014          if (instr->opcode == aco_opcode::p_create_vector) {
1015             PhysReg reg(def_reg.lo());
1016             for (unsigned i = 0; i < instr->operands.size(); i++) {
1017                if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
1018                   res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) &&
1019                                  !reg_file.test(reg, var.rc.bytes())};
1020                   break;
1021                }
1022                reg.reg_b += instr->operands[i].bytes();
1023             }
1024             if (!res.second)
1025                res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
1026          } else {
1027             info.bounds = def_reg;
1028             res = get_reg_simple(ctx, reg_file, info);
1029          }
1030       } else {
1031          /* Try to find space within the bounds but outside of the definition */
1032          info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
1033          res = get_reg_simple(ctx, reg_file, info);
1034          if (!res.second && def_reg.hi() <= bounds.hi()) {
1035             unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
1036             info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
1037             res = get_reg_simple(ctx, reg_file, info);
1038          }
1039       }
1040 
1041       if (res.second) {
1042          /* mark the area as blocked */
1043          reg_file.block(res.first, var.rc);
1044 
1045          /* create parallelcopy pair (without definition id) */
1046          Temp tmp = Temp(id, var.rc);
1047          Operand pc_op = Operand(tmp);
1048          pc_op.setFixed(var.reg);
1049          Definition pc_def = Definition(res.first, pc_op.regClass());
1050          parallelcopies.emplace_back(pc_op, pc_def);
1051          continue;
1052       }
1053 
1054       PhysReg best_pos = bounds.lo();
1055       unsigned num_moves = 0xFF;
1056       unsigned num_vars = 0;
1057 
1058       /* we use a sliding window to find potential positions */
1059       unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
1060       for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
1061            reg_win += stride) {
1062          if (!is_dead_operand && intersects(reg_win, def_reg))
1063             continue;
1064 
1065          /* second, check that we have at most k=num_moves elements in the window
1066           * and no element is larger than the currently processed one */
1067          unsigned k = 0;
1068          unsigned n = 0;
1069          unsigned last_var = 0;
1070          bool found = true;
1071          for (PhysReg j : reg_win) {
1072             if (reg_file[j] == 0 || reg_file[j] == last_var)
1073                continue;
1074 
1075             if (reg_file.is_blocked(j) || k > num_moves) {
1076                found = false;
1077                break;
1078             }
1079             if (reg_file[j] == 0xF0000000) {
1080                k += 1;
1081                n++;
1082                continue;
1083             }
1084             /* we cannot split live ranges of linear vgprs inside control flow */
1085             if (!(ctx.block->kind & block_kind_top_level) &&
1086                 ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
1087                found = false;
1088                break;
1089             }
1090             bool is_kill = false;
1091             for (const Operand& op : instr->operands) {
1092                if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
1093                   is_kill = true;
1094                   break;
1095                }
1096             }
1097             if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
1098                found = false;
1099                break;
1100             }
1101 
1102             k += ctx.assignments[reg_file[j]].rc.size();
1103             last_var = reg_file[j];
1104             n++;
1105             if (k > num_moves || (k == num_moves && n <= num_vars)) {
1106                found = false;
1107                break;
1108             }
1109          }
1110 
1111          if (found) {
1112             best_pos = reg_win.lo();
1113             num_moves = k;
1114             num_vars = n;
1115          }
1116       }
1117 
1118       /* FIXME: we messed up and couldn't find space for the variables to be copied */
1119       if (num_moves == 0xFF)
1120          return false;
1121 
1122       PhysRegInterval reg_win{best_pos, size};
1123 
1124       /* collect variables and block reg file */
1125       std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, reg_win);
1126 
1127       /* mark the area as blocked */
1128       reg_file.block(reg_win.lo(), var.rc);
1129       adjust_max_used_regs(ctx, var.rc, reg_win.lo());
1130 
1131       if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg))
1132          return false;
1133 
1134       /* create parallelcopy pair (without definition id) */
1135       Temp tmp = Temp(id, var.rc);
1136       Operand pc_op = Operand(tmp);
1137       pc_op.setFixed(var.reg);
1138       Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
1139       parallelcopies.emplace_back(pc_op, pc_def);
1140    }
1141 
1142    return true;
1143 }
1144 
1145 std::pair<PhysReg, bool>
get_reg_impl(ra_ctx & ctx,RegisterFile & reg_file,std::vector<std::pair<Operand,Definition>> & parallelcopies,const DefInfo & info,aco_ptr<Instruction> & instr)1146 get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
1147              std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
1148              aco_ptr<Instruction>& instr)
1149 {
1150    const PhysRegInterval& bounds = info.bounds;
1151    uint32_t size = info.size;
1152    uint32_t stride = info.stride;
1153    RegClass rc = info.rc;
1154 
1155    /* check how many free regs we have */
1156    unsigned regs_free = reg_file.count_zero(bounds);
1157 
1158    /* mark and count killed operands */
1159    unsigned killed_ops = 0;
1160    std::bitset<256> is_killed_operand; /* per-register */
1161    for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
1162       Operand& op = instr->operands[j];
1163       if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
1164           !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
1165          assert(op.isFixed());
1166 
1167          for (unsigned i = 0; i < op.size(); ++i) {
1168             is_killed_operand[(op.physReg() & 0xff) + i] = true;
1169          }
1170 
1171          killed_ops += op.getTemp().size();
1172       }
1173    }
1174 
1175    assert(regs_free >= size);
1176    /* we might have to move dead operands to dst in order to make space */
1177    unsigned op_moves = 0;
1178 
1179    if (size > (regs_free - killed_ops))
1180       op_moves = size - (regs_free - killed_ops);
1181 
1182    /* find the best position to place the definition */
1183    PhysRegInterval best_win = {bounds.lo(), size};
1184    unsigned num_moves = 0xFF;
1185    unsigned num_vars = 0;
1186 
1187    /* we use a sliding window to check potential positions */
1188    for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
1189         reg_win += stride) {
1190       /* first check if the register window starts in the middle of an
1191        * allocated variable: this is what we have to fix to allow for
1192        * num_moves > size */
1193       if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
1194           reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
1195          continue;
1196       if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
1197           reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
1198          continue;
1199 
1200       /* second, check that we have at most k=num_moves elements in the window
1201        * and no element is larger than the currently processed one */
1202       unsigned k = op_moves;
1203       unsigned n = 0;
1204       unsigned remaining_op_moves = op_moves;
1205       unsigned last_var = 0;
1206       bool found = true;
1207       bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
1208       for (const PhysReg j : reg_win) {
1209          /* dead operands effectively reduce the number of estimated moves */
1210          if (is_killed_operand[j & 0xFF]) {
1211             if (remaining_op_moves) {
1212                k--;
1213                remaining_op_moves--;
1214             }
1215             continue;
1216          }
1217 
1218          if (reg_file[j] == 0 || reg_file[j] == last_var)
1219             continue;
1220 
1221          if (reg_file[j] == 0xF0000000) {
1222             k += 1;
1223             n++;
1224             continue;
1225          }
1226 
1227          if (ctx.assignments[reg_file[j]].rc.size() >= size) {
1228             found = false;
1229             break;
1230          }
1231 
1232          /* we cannot split live ranges of linear vgprs inside control flow */
1233          // TODO: ensure that live range splits inside control flow are never necessary
1234          if (!(ctx.block->kind & block_kind_top_level) &&
1235              ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
1236             found = false;
1237             break;
1238          }
1239 
1240          k += ctx.assignments[reg_file[j]].rc.size();
1241          n++;
1242          last_var = reg_file[j];
1243       }
1244 
1245       if (!found || k > num_moves)
1246          continue;
1247       if (k == num_moves && n < num_vars)
1248          continue;
1249       if (!aligned && k == num_moves && n == num_vars)
1250          continue;
1251 
1252       if (found) {
1253          best_win = reg_win;
1254          num_moves = k;
1255          num_vars = n;
1256       }
1257    }
1258 
1259    if (num_moves == 0xFF)
1260       return {{}, false};
1261 
1262    /* now, we figured the placement for our definition */
1263    RegisterFile tmp_file(reg_file);
1264    std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win);
1265 
1266    if (instr->opcode == aco_opcode::p_create_vector) {
1267       /* move killed operands which aren't yet at the correct position (GFX9+)
1268        * or which are in the definition space */
1269       PhysReg reg = best_win.lo();
1270       for (Operand& op : instr->operands) {
1271          if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) {
1272             if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 ||
1273                                         (op.physReg().advance(op.bytes()) > best_win.lo() &&
1274                                          op.physReg() < best_win.hi()))) {
1275                vars.emplace(op.bytes(), op.tempId());
1276                tmp_file.clear(op);
1277             } else {
1278                tmp_file.fill(op);
1279             }
1280          }
1281          reg.reg_b += op.bytes();
1282       }
1283    } else if (!is_phi(instr)) {
1284       /* re-enable killed operands */
1285       for (Operand& op : instr->operands) {
1286          if (op.isTemp() && op.isFirstKillBeforeDef())
1287             tmp_file.fill(op);
1288       }
1289    }
1290 
1291    std::vector<std::pair<Operand, Definition>> pc;
1292    if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win))
1293       return {{}, false};
1294 
1295    parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1296 
1297    adjust_max_used_regs(ctx, rc, best_win.lo());
1298    return {best_win.lo(), true};
1299 }
1300 
1301 bool
get_reg_specified(ra_ctx & ctx,RegisterFile & reg_file,RegClass rc,aco_ptr<Instruction> & instr,PhysReg reg)1302 get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
1303                   PhysReg reg)
1304 {
1305    /* catch out-of-range registers */
1306    if (reg >= PhysReg{512})
1307       return false;
1308 
1309    std::pair<unsigned, unsigned> sdw_def_info;
1310    if (rc.is_subdword())
1311       sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);
1312 
1313    if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
1314       return false;
1315    if (!rc.is_subdword() && reg.byte())
1316       return false;
1317 
1318    if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0)
1319       return false;
1320 
1321    PhysRegInterval reg_win = {reg, rc.size()};
1322    PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
1323    PhysRegInterval vcc_win = {vcc, 2};
1324    /* VCC is outside the bounds */
1325    bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win);
1326    bool is_m0 = rc == s1 && reg == m0;
1327    if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
1328       return false;
1329 
1330    if (rc.is_subdword()) {
1331       PhysReg test_reg;
1332       test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
1333       if (reg_file.test(test_reg, sdw_def_info.second))
1334          return false;
1335    } else {
1336       if (reg_file.test(reg, rc.bytes()))
1337          return false;
1338    }
1339 
1340    adjust_max_used_regs(ctx, rc, reg_win.lo());
1341    return true;
1342 }
1343 
1344 bool
increase_register_file(ra_ctx & ctx,RegType type)1345 increase_register_file(ra_ctx& ctx, RegType type)
1346 {
1347    if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
1348       update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
1349                                                           ctx.program->max_reg_demand.sgpr));
1350    } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
1351       update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
1352                                                           ctx.program->max_reg_demand.sgpr + 1));
1353    } else {
1354       return false;
1355    }
1356    return true;
1357 }
1358 
1359 struct IDAndRegClass {
IDAndRegClassaco::__anon99bf8b5d0111::IDAndRegClass1360    IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}
1361 
1362    unsigned id;
1363    RegClass rc;
1364 };
1365 
1366 struct IDAndInfo {
IDAndInfoaco::__anon99bf8b5d0111::IDAndInfo1367    IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}
1368 
1369    unsigned id;
1370    DefInfo info;
1371 };
1372 
1373 /* Reallocates vars by sorting them and placing each variable after the previous
1374  * one. If one of the variables has 0xffffffff as an ID, the register assigned
1375  * for that variable will be returned.
1376  */
1377 PhysReg
compact_relocate_vars(ra_ctx & ctx,const std::vector<IDAndRegClass> & vars,std::vector<std::pair<Operand,Definition>> & parallelcopies,PhysReg start)1378 compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
1379                       std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
1380 {
1381    /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
1382     * temporary sizes to dwords.
1383     */
1384    std::vector<IDAndInfo> sorted;
1385    for (IDAndRegClass var : vars) {
1386       DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
1387       sorted.emplace_back(var.id, info);
1388    }
1389 
1390    std::sort(
1391       sorted.begin(), sorted.end(),
1392       [&ctx](const IDAndInfo& a, const IDAndInfo& b)
1393       {
1394          unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
1395          unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
1396          if (a_stride > b_stride)
1397             return true;
1398          if (a_stride < b_stride)
1399             return false;
1400          if (a.id == 0xffffffff || b.id == 0xffffffff)
1401             return a.id ==
1402                    0xffffffff; /* place 0xffffffff before others if possible, not for any reason */
1403          return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
1404       });
1405 
1406    PhysReg next_reg = start;
1407    PhysReg space_reg;
1408    for (IDAndInfo& var : sorted) {
1409       unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
1410       next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));
1411 
1412       /* 0xffffffff is a special variable ID used reserve a space for killed
1413        * operands and definitions.
1414        */
1415       if (var.id != 0xffffffff) {
1416          if (next_reg != ctx.assignments[var.id].reg) {
1417             RegClass rc = ctx.assignments[var.id].rc;
1418             Temp tmp(var.id, rc);
1419 
1420             Operand pc_op(tmp);
1421             pc_op.setFixed(ctx.assignments[var.id].reg);
1422             Definition pc_def(next_reg, rc);
1423             parallelcopies.emplace_back(pc_op, pc_def);
1424          }
1425       } else {
1426          space_reg = next_reg;
1427       }
1428 
1429       adjust_max_used_regs(ctx, var.info.rc, next_reg);
1430 
1431       next_reg = next_reg.advance(var.info.rc.size() * 4);
1432    }
1433 
1434    return space_reg;
1435 }
1436 
1437 bool
is_mimg_vaddr_intact(ra_ctx & ctx,RegisterFile & reg_file,Instruction * instr)1438 is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
1439 {
1440    PhysReg first{512};
1441    for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
1442       Operand op = instr->operands[i + 3];
1443 
1444       if (ctx.assignments[op.tempId()].assigned) {
1445          PhysReg reg = ctx.assignments[op.tempId()].reg;
1446 
1447          if (first.reg() == 512) {
1448             PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
1449             first = reg.advance(i * -4);
1450             PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
1451             if (!bounds.contains(vec)) /* not enough space for other operands */
1452                return false;
1453          } else {
1454             if (reg != first.advance(i * 4)) /* not at the best position */
1455                return false;
1456          }
1457       } else {
1458          /* If there's an unexpected temporary, this operand is unlikely to be
1459           * placed in the best position.
1460           */
1461          if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
1462             return false;
1463       }
1464    }
1465 
1466    return true;
1467 }
1468 
1469 std::pair<PhysReg, bool>
get_reg_vector(ra_ctx & ctx,RegisterFile & reg_file,Temp temp,aco_ptr<Instruction> & instr)1470 get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
1471 {
1472    Instruction* vec = ctx.vectors[temp.id()];
1473    unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
1474    unsigned our_offset = 0;
1475    for (unsigned i = first_operand; i < vec->operands.size(); i++) {
1476       Operand& op = vec->operands[i];
1477       if (op.isTemp() && op.tempId() == temp.id())
1478          break;
1479       else
1480          our_offset += op.bytes();
1481    }
1482 
1483    if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
1484       unsigned their_offset = 0;
1485       /* check for every operand of the vector
1486        * - whether the operand is assigned and
1487        * - we can use the register relative to that operand
1488        */
1489       for (unsigned i = first_operand; i < vec->operands.size(); i++) {
1490          Operand& op = vec->operands[i];
1491          if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
1492              ctx.assignments[op.tempId()].assigned) {
1493             PhysReg reg = ctx.assignments[op.tempId()].reg;
1494             reg.reg_b += (our_offset - their_offset);
1495             if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
1496                return {reg, true};
1497 
1498             /* return if MIMG vaddr components don't remain vector-aligned */
1499             if (vec->format == Format::MIMG)
1500                return {{}, false};
1501          }
1502          their_offset += op.bytes();
1503       }
1504 
1505       /* We didn't find a register relative to other vector operands.
1506        * Try to find new space which fits the whole vector.
1507        */
1508       RegClass vec_rc = RegClass::get(temp.type(), their_offset);
1509       DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
1510       std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
1511       PhysReg reg = res.first;
1512       if (res.second) {
1513          reg.reg_b += our_offset;
1514          /* make sure to only use byte offset if the instruction supports it */
1515          if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
1516             return {reg, true};
1517       }
1518    }
1519    return {{}, false};
1520 }
1521 
PhysReg
get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
        std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
        int operand_index = -1)
{
   /* Main register-selection entry point: find a register for "temp", preferring
    * affinities and cheap placements, and fall back to live-range splitting or a
    * full compaction of the register file if nothing else fits.
    */

   /* If this temp is later consumed by a p_split_vector, try to place it so that
    * split definitions with assigned affinities end up exactly where the split
    * would put them (making the split a no-op copy). */
   auto split_vec = ctx.split_vectors.find(temp.id());
   if (split_vec != ctx.split_vectors.end()) {
      unsigned offset = 0;
      for (Definition def : split_vec->second->definitions) {
         if (ctx.assignments[def.tempId()].affinity) {
            assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
            if (affinity.assigned) {
               /* back out the definition's byte offset within the vector */
               PhysReg reg = affinity.reg;
               reg.reg_b -= offset;
               if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
                  return reg;
            }
         }
         offset += def.bytes();
      }
   }

   /* Honor a direct affinity if its register is usable here. */
   if (ctx.assignments[temp.id()].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
      if (affinity.assigned) {
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
            return affinity.reg;
      }
   }

   std::pair<PhysReg, bool> res;

   /* Operands of a p_create_vector (or MIMG vector operands) try to get a
    * position that makes the eventual vector gather cheap. */
   if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
      res = get_reg_vector(ctx, reg_file, temp, instr);
      if (res.second)
         return res.first;
   }

   DefInfo info(ctx, instr, temp.regClass(), operand_index);

   if (!ctx.policy.skip_optimistic_path) {
      /* try to find space without live-range splits */
      res = get_reg_simple(ctx, reg_file, info);

      if (res.second)
         return res.first;
   }

   /* try to find space with live-range splits */
   res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);

   if (res.second)
      return res.first;

   /* try using more registers */

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
   assert(reg_file.count_zero(info.bounds) >= info.size);

   if (!increase_register_file(ctx, info.rc.type())) {
      /* fallback algorithm: reallocate all variables at once
       *
       * We compact every currently-assigned variable of this register type to
       * the bottom of the file, leaving one hole big enough for this
       * definition plus whichever is larger: the other definitions of the
       * instruction or its killed operands. */
      unsigned def_size = info.rc.size();
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_size += def.regClass().size();
      }

      unsigned killed_op_size = 0;
      for (Operand op : instr->operands) {
         if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_size += op.regClass().size();
      }

      const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());

      /* reallocate passthrough variables and non-killed operands */
      std::vector<IDAndRegClass> vars;
      for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs))
         vars.emplace_back(var.second, ctx.assignments[var.second].rc);
      /* 0xffffffff is a placeholder id reserving the hole described above */
      vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));

      PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());

      /* reallocate killed operands into the reserved hole */
      std::vector<IDAndRegClass> killed_op_vars;
      for (Operand op : instr->operands) {
         if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_vars.emplace_back(op.tempId(), op.regClass());
      }
      compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);

      /* reallocate definitions; the placeholder entry marks where "temp" goes */
      std::vector<IDAndRegClass> def_vars;
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_vars.emplace_back(def.tempId(), def.regClass());
      }
      def_vars.emplace_back(0xffffffff, info.rc);
      return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
   }

   /* the register file grew: retry with the larger limit */
   return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
}
1626 
PhysReg
get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                      aco_ptr<Instruction>& instr)
{
   /* Choose a register for a p_create_vector definition by scoring candidate
    * positions derived from the current locations of its killed operands, so
    * that as many operands as possible are already in place.
    */
   RegClass rc = temp.regClass();
   /* create_vector instructions have different costs w.r.t. register coalescing */
   uint32_t size = rc.size();
   uint32_t bytes = rc.bytes();
   uint32_t stride = get_stride(rc);
   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());

   // TODO: improve p_create_vector for sub-dword vectors

   PhysReg best_pos{0xFFF};
   unsigned num_moves = 0xFF; /* score of the best candidate (bytes to move) */
   bool best_avoid = true;

   /* test for each operand which definition placement causes the least shuffle instructions */
   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      // TODO: think about, if we can alias live operands on the same register
      if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
          instr->operands[i].getTemp().type() != rc.type())
         continue;

      /* candidate: place the vector so operand i stays where it is */
      if (offset > instr->operands[i].physReg().reg_b)
         continue;

      unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
      if (reg_lower % 4)
         continue;
      PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
      unsigned k = 0;

      /* no need to check multiple times */
      if (reg_win.lo() == best_pos)
         continue;

      /* check borders: reject windows that would slice a variable in half */
      // TODO: this can be improved
      if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
         continue;
      if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* count variables to be moved and check "avoid" */
      bool avoid = false;
      bool linear_vgpr = false;
      for (PhysReg j : reg_win) {
         if (reg_file[j] != 0) {
            if (reg_file[j] == 0xF0000000) {
               /* sentinel for a partially used dword: count only occupied bytes */
               PhysReg reg;
               reg.reg_b = j * 4;
               unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
               for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
                  k += reg_file.test(reg, 1);
            } else {
               k += 4;
               linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
            }
         }
         avoid |= ctx.war_hint[j];
      }

      if (linear_vgpr) {
         /* we cannot split live ranges of linear vgprs inside control flow */
         if (ctx.block->kind & block_kind_top_level)
            avoid = true;
         else
            continue;
      }

      if (avoid && !best_avoid)
         continue;

      /* count operands in wrong positions */
      for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
           offset2 += instr->operands[j].bytes(), j++) {
         if (j == i || !instr->operands[j].isTemp() ||
             instr->operands[j].getTemp().type() != rc.type())
            continue;
         if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2)
            k += instr->operands[j].bytes();
      }
      /* prefer 4-aligned windows for v4 so ties break towards alignment */
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      if (k > num_moves || (!aligned && k == num_moves))
         continue;

      best_pos = reg_win.lo();
      num_moves = k;
      best_avoid = avoid;
   }

   /* if we'd move at least as many bytes as the vector holds, a plain
    * allocation can't be worse */
   if (num_moves >= bytes)
      return get_reg(ctx, reg_file, temp, parallelcopies, instr);

   /* re-enable killed operands which are in the wrong position */
   RegisterFile tmp_file(reg_file);
   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
          instr->operands[i].physReg().reg_b != best_pos.reg_b + offset)
         tmp_file.fill(instr->operands[i]);
   }

   /* collect variables to be moved */
   std::set<std::pair<unsigned, unsigned>> vars =
      collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});

   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
          instr->operands[i].getTemp().type() != rc.type())
         continue;
      bool correct_pos = instr->operands[i].physReg().reg_b == best_pos.reg_b + offset;
      /* GFX9+: move killed operands which aren't yet at the correct position
       * Moving all killed operands generally leads to more register swaps.
       * This is only done on GFX9+ because of the cheap v_swap instruction.
       */
      if (ctx.program->chip_class >= GFX9 && !correct_pos) {
         vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
         tmp_file.clear(instr->operands[i]);
         /* fill operands which are in the correct position to avoid overwriting */
      } else if (correct_pos) {
         tmp_file.fill(instr->operands[i]);
      }
   }
   bool success = false;
   std::vector<std::pair<Operand, Definition>> pc;
   success =
      get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size});

   if (!success) {
      if (!increase_register_file(ctx, temp.type())) {
         /* use the fallback algorithm in get_reg() */
         return get_reg(ctx, reg_file, temp, parallelcopies, instr);
      }
      /* the register file grew: retry with the larger limit */
      return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
   adjust_max_used_regs(ctx, rc, best_pos);

   return best_pos;
}
1777 
void
handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
{
   /* Select a scratch SGPR for pseudo instructions whose lowering needs one,
    * and record whether SCC is live across the instruction so the lowering
    * can preserve it.
    */
   if (instr->format != Format::PSEUDO)
      return;

   /* all instructions which use handle_operands() need this information */
   switch (instr->opcode) {
   case aco_opcode::p_extract_vector:
   case aco_opcode::p_create_vector:
   case aco_opcode::p_split_vector:
   case aco_opcode::p_parallelcopy:
   case aco_opcode::p_wqm: break;
   default: return;
   }

   bool writes_linear = false;
   /* if all definitions are logical vgpr, no need to care for SCC */
   for (Definition& def : instr->definitions) {
      if (def.getTemp().regClass().is_linear())
         writes_linear = true;
   }
   /* if all operands are constant, no need to care either */
   bool reads_linear = false;
   bool reads_subdword = false;
   for (Operand& op : instr->operands) {
      if (op.isTemp() && op.getTemp().regClass().is_linear())
         reads_linear = true;
      if (op.isTemp() && op.regClass().is_subdword())
         reads_subdword = true;
   }
   /* a scratch reg is needed when SCC must be preserved across linear copies,
    * or (GFX7 and earlier) when sub-dword operands are involved */
   bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
                            (ctx.program->chip_class <= GFX7 && reads_subdword);
   if (!needs_scratch_reg)
      return;

   instr->pseudo().tmp_in_scc = reg_file[scc];

   /* search downwards from the highest SGPR already in use for a free one */
   int reg = ctx.max_used_sgpr;
   for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
      ;
   if (reg < 0) {
      /* no free SGPR below the high-water mark: search upwards instead */
      reg = ctx.max_used_sgpr + 1;
      for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
         ;
      if (reg == ctx.program->max_reg_demand.sgpr) {
         /* every SGPR is occupied; fall back to m0, which is only acceptable
          * for the sub-dword case (asserted) */
         assert(reads_subdword && reg_file[m0] == 0);
         reg = m0;
      }
   }

   adjust_max_used_regs(ctx, s1, reg);
   instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
}
1832 
1833 bool
operand_can_use_reg(chip_class chip,aco_ptr<Instruction> & instr,unsigned idx,PhysReg reg,RegClass rc)1834 operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
1835                     RegClass rc)
1836 {
1837    if (instr->operands[idx].isFixed())
1838       return instr->operands[idx].physReg() == reg;
1839 
1840    bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
1841                        instr->opcode == aco_opcode::v_writelane_b32_e64;
1842    if (chip <= GFX9 && is_writelane && idx <= 1) {
1843       /* v_writelane_b32 can take two sgprs but only if one is m0. */
1844       bool is_other_sgpr =
1845          instr->operands[!idx].isTemp() &&
1846          (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0);
1847       if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {
1848          instr->operands[idx].setFixed(m0);
1849          return reg == m0;
1850       }
1851    }
1852 
1853    if (reg.byte()) {
1854       unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc);
1855       if (reg.byte() % stride)
1856          return false;
1857    }
1858 
1859    switch (instr->format) {
1860    case Format::SMEM:
1861       return reg != scc && reg != exec &&
1862              (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1863              (reg != vcc || (instr->definitions.empty() && idx == 2) ||
1864               chip >= GFX10); /* sdata can be vcc */
1865    default:
1866       // TODO: there are more instructions with restrictions on registers
1867       return true;
1868    }
1869 }
1870 
void
get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopy,
                    aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
{
   /* Move an operand into a usable register: either its fixed destination
    * (evicting whatever blocks it) or a freshly chosen register, and record
    * the required copy in "parallelcopy".
    */

   /* check if the operand is fixed */
   PhysReg src = ctx.assignments[operand.tempId()].reg;
   PhysReg dst;
   if (operand.isFixed()) {
      assert(operand.physReg() != src);

      /* check if target reg is blocked, and move away the blocking var */
      if (register_file.test(operand.physReg(), operand.bytes())) {
         PhysRegInterval target{operand.physReg(), operand.size()};

         RegisterFile tmp_file(register_file);

         std::set<std::pair<unsigned, unsigned>> blocking_vars =
            collect_vars(ctx, tmp_file, target);

         tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src
         /* block the target so the evicted variables don't land back on it */
         tmp_file.block(operand.physReg(), operand.regClass());

         DefInfo info(ctx, instr, operand.regClass(), -1);
         get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr,
                             PhysRegInterval());
      }
      dst = operand.physReg();

   } else {
      /* clear the operand in case it's only a stride mismatch */
      register_file.clear(src, operand.regClass());
      dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
   }

   /* emit the src -> dst copy and propagate the rename to later uses */
   Operand pc_op = operand;
   pc_op.setFixed(src);
   Definition pc_def = Definition(dst, pc_op.regClass());
   parallelcopy.emplace_back(pc_op, pc_def);
   update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
}
1912 
void
get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
                  std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
{
   /* Assign registers to the phis at the top of "block" in three passes of
    * decreasing preference, appending the finished phis to "instructions".
    */

   /* assign phis with all-matching registers to that register */
   for (aco_ptr<Instruction>& phi : block.instructions) {
      if (!is_phi(phi))
         break;
      Definition& definition = phi->definitions[0];
      if (definition.isKill() || definition.isFixed())
         continue;

      if (!phi->operands[0].isTemp())
         continue;

      /* all operands must either be unfixed or fixed to the same register */
      PhysReg reg = phi->operands[0].physReg();
      auto OpsSame = [=](const Operand& op) -> bool
      { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
      bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
      if (!all_same)
         continue;

      if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg))
         continue;

      definition.setFixed(reg);
      register_file.fill(definition);
      ctx.assignments[definition.tempId()].set(definition);
   }

   /* try to find a register that is used by at least one operand */
   for (aco_ptr<Instruction>& phi : block.instructions) {
      if (!is_phi(phi))
         break;
      Definition& definition = phi->definitions[0];
      if (definition.isKill() || definition.isFixed())
         continue;

      /* use affinity if available */
      if (ctx.assignments[definition.tempId()].affinity &&
          ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
         assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
         assert(affinity.rc == definition.regClass());
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) {
            definition.setFixed(affinity.reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            continue;
         }
      }

      /* by going backwards, we aim to avoid copies in else-blocks */
      for (int i = phi->operands.size() - 1; i >= 0; i--) {
         const Operand& op = phi->operands[i];
         if (!op.isTemp() || !op.isFixed())
            continue;

         PhysReg reg = op.physReg();
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
            definition.setFixed(reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            break;
         }
      }
   }

   /* find registers for phis where the register was blocked or no operand was assigned */
   for (aco_ptr<Instruction>& phi : block.instructions) {
      if (!is_phi(phi))
         break;

      Definition& definition = phi->definitions[0];
      if (definition.isKill())
         continue;

      if (definition.isFixed()) {
         /* already handled by one of the passes above */
         instructions.emplace_back(std::move(phi));
         continue;
      }

      std::vector<std::pair<Operand, Definition>> parallelcopy;
      definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
      update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);

      /* process parallelcopy */
      for (std::pair<Operand, Definition> pc : parallelcopy) {
         /* see if it's a copy from a different phi */
         // TODO: prefer moving some previous phis over live-ins
         // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
         // problem in practice since they can only be fixed to exec)
         Instruction* prev_phi = NULL;
         std::vector<aco_ptr<Instruction>>::iterator phi_it;
         for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
            if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
               prev_phi = phi_it->get();
         }
         if (prev_phi) {
            /* if so, just update that phi's register */
            prev_phi->definitions[0].setFixed(pc.second.physReg());
            ctx.assignments[prev_phi->definitions[0].tempId()].set(pc.second);
            continue;
         }

         /* rename: record the copy's new name under the variable's original id */
         std::unordered_map<unsigned, Temp>::iterator orig_it =
            ctx.orig_names.find(pc.first.tempId());
         Temp orig = pc.first.getTemp();
         if (orig_it != ctx.orig_names.end())
            orig = orig_it->second;
         else
            ctx.orig_names[pc.second.tempId()] = orig;
         ctx.renames[block.index][orig.id()] = pc.second.getTemp();

         /* otherwise, this is a live-in and we need to create a new phi
          * to move it in this block's predecessors */
         aco_opcode opcode =
            pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
         std::vector<unsigned>& preds =
            pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
         aco_ptr<Instruction> new_phi{
            create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
         new_phi->definitions[0] = pc.second;
         for (unsigned i = 0; i < preds.size(); i++)
            new_phi->operands[i] = Operand(pc.first);
         instructions.emplace_back(std::move(new_phi));

         /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis()
          * would re-create this phi later if this is a loop header.
          */
         live_in.erase(orig.id());
      }

      register_file.fill(definition);
      ctx.assignments[definition.tempId()].set(definition);
      instructions.emplace_back(std::move(phi));
   }
}
2051 
2052 Temp
read_variable(ra_ctx & ctx,Temp val,unsigned block_idx)2053 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
2054 {
2055    std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
2056    if (it == ctx.renames[block_idx].end())
2057       return val;
2058    else
2059       return it->second;
2060 }
2061 
Temp
handle_live_in(ra_ctx& ctx, Temp val, Block* block)
{
   /* Resolve the SSA name of live-in "val" at the top of "block", inserting a
    * phi at the block start when the predecessors disagree on the name.
    * Returns the (possibly new) name.
    */
   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
   if (preds.size() == 0)
      return val;

   if (preds.size() == 1) {
      /* if the block has only one predecessor, just look there for the name */
      return read_variable(ctx, val, preds[0]);
   }

   /* there are multiple predecessors and the block is sealed */
   Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));

   /* get the rename from each predecessor and check if they are the same */
   Temp new_val;
   bool needs_phi = false;
   for (unsigned i = 0; i < preds.size(); i++) {
      ops[i] = read_variable(ctx, val, preds[i]);
      if (i == 0)
         new_val = ops[i];
      else
         needs_phi |= !(new_val == ops[i]);
   }

   if (needs_phi) {
      assert(!val.regClass().is_linear_vgpr());

      /* the variable has been renamed differently in the predecessors: we need to insert a phi */
      aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
      aco_ptr<Instruction> phi{
         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
      new_val = ctx.program->allocateTmp(val.regClass());
      phi->definitions[0] = Definition(new_val);
      /* keep the assignments table in sync with the new temp id */
      ctx.assignments.emplace_back();
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      for (unsigned i = 0; i < preds.size(); i++) {
         /* update the operands so that it uses the new affinity */
         phi->operands[i] = Operand(ops[i]);
         assert(ctx.assignments[ops[i].id()].assigned);
         assert(ops[i].regClass() == new_val.regClass());
         phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
      }
      block->instructions.insert(block->instructions.begin(), std::move(phi));
   }

   return new_val;
}
2111 
void
handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
                 uint32_t loop_exit_idx)
{
   /* Called at the loop exit: create loop-header phis for variables that were
    * renamed inside the loop, and propagate those renames through the loop body.
    */
   Block& loop_header = ctx.program->blocks[loop_header_idx];
   std::unordered_map<unsigned, Temp> renames;

   /* create phis for variables renamed during the loop */
   for (unsigned t : live_in) {
      Temp val = Temp(t, ctx.program->temp_rc[t]);
      /* "prev" is the name coming in from the preheader */
      Temp prev = read_variable(ctx, val, loop_header_idx - 1);
      Temp renamed = handle_live_in(ctx, val, &loop_header);
      if (renamed == prev)
         continue;

      /* insert additional renames at block end, but don't overwrite */
      renames[prev.id()] = renamed;
      ctx.orig_names[renamed.id()] = val;
      for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
         auto it = ctx.renames[idx].emplace(val.id(), renamed);
         /* if insertion is unsuccessful, update if necessary */
         if (!it.second && it.first->second == prev)
            it.first->second = renamed;
      }

      /* update loop-carried values of the phi created by handle_live_in() */
      for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
         Operand& op = loop_header.instructions[0]->operands[i];
         if (op.getTemp() == prev)
            op.setTemp(renamed);
      }

      /* use the assignment from the loop preheader and fix def reg */
      assignment& var = ctx.assignments[prev.id()];
      ctx.assignments[renamed.id()] = var;
      loop_header.instructions[0]->definitions[0].setFixed(var.reg);
   }

   /* rename loop carried phi operands
    * (skipping the first renames.size() instructions — presumably the phis
    * handle_live_in() just prepended, whose operands were set above) */
   for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
      aco_ptr<Instruction>& phi = loop_header.instructions[i];
      if (!is_phi(phi))
         break;
      const std::vector<unsigned>& preds =
         phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
      for (unsigned j = 1; j < phi->operands.size(); j++) {
         Operand& op = phi->operands[j];
         if (!op.isTemp())
            continue;

         /* Find the original name, since this operand might not use the original name if the phi
          * was created after init_reg_file().
          */
         std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId());
         Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();

         op.setTemp(read_variable(ctx, orig, preds[j]));
         op.setFixed(ctx.assignments[op.tempId()].reg);
      }
   }

   /* return early if no new phi was created */
   if (renames.empty())
      return;

   /* propagate new renames through loop */
   for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
      Block& current = ctx.program->blocks[idx];
      /* rename all uses in this block */
      for (aco_ptr<Instruction>& instr : current.instructions) {
         /* phis are renamed after RA */
         if (idx == loop_header_idx && is_phi(instr))
            continue;

         for (Operand& op : instr->operands) {
            if (!op.isTemp())
               continue;

            auto rename = renames.find(op.tempId());
            if (rename != renames.end()) {
               assert(rename->second.id());
               op.setTemp(rename->second);
            }
         }
      }
   }
}
2199 
2200 /**
2201  * This function serves the purpose to correctly initialize the register file
2202  * at the beginning of a block (before any existing phis).
2203  * In order to do so, all live-in variables are entered into the RegisterFile.
2204  * Reg-to-reg moves (renames) from previous blocks are taken into account and
2205  * the SSA is repaired by inserting corresponding phi-nodes.
2206  */
2207 RegisterFile
init_reg_file(ra_ctx & ctx,const std::vector<IDSet> & live_out_per_block,Block & block)2208 init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
2209 {
2210    if (block.kind & block_kind_loop_exit) {
2211       uint32_t header = ctx.loop_header.back();
2212       ctx.loop_header.pop_back();
2213       handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
2214    }
2215 
2216    RegisterFile register_file;
2217    const IDSet& live_in = live_out_per_block[block.index];
2218    assert(block.index != 0 || live_in.empty());
2219 
2220    if (block.kind & block_kind_loop_header) {
2221       ctx.loop_header.emplace_back(block.index);
2222       /* already rename phis incoming value */
2223       for (aco_ptr<Instruction>& instr : block.instructions) {
2224          if (!is_phi(instr))
2225             break;
2226          Operand& operand = instr->operands[0];
2227          if (operand.isTemp()) {
2228             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
2229             operand.setFixed(ctx.assignments[operand.tempId()].reg);
2230          }
2231       }
2232       for (unsigned t : live_in) {
2233          Temp val = Temp(t, ctx.program->temp_rc[t]);
2234          Temp renamed = read_variable(ctx, val, block.index - 1);
2235          if (renamed != val)
2236             ctx.renames[block.index][val.id()] = renamed;
2237          assignment& var = ctx.assignments[renamed.id()];
2238          assert(var.assigned);
2239          register_file.fill(Definition(renamed.id(), var.reg, var.rc));
2240       }
2241    } else {
2242       /* rename phi operands */
2243       for (aco_ptr<Instruction>& instr : block.instructions) {
2244          if (!is_phi(instr))
2245             break;
2246          const std::vector<unsigned>& preds =
2247             instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
2248 
2249          for (unsigned i = 0; i < instr->operands.size(); i++) {
2250             Operand& operand = instr->operands[i];
2251             if (!operand.isTemp())
2252                continue;
2253             operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2254             operand.setFixed(ctx.assignments[operand.tempId()].reg);
2255          }
2256       }
2257       for (unsigned t : live_in) {
2258          Temp val = Temp(t, ctx.program->temp_rc[t]);
2259          Temp renamed = handle_live_in(ctx, val, &block);
2260          assignment& var = ctx.assignments[renamed.id()];
2261          /* due to live-range splits, the live-in might be a phi, now */
2262          if (var.assigned) {
2263             register_file.fill(Definition(renamed.id(), var.reg, var.rc));
2264          }
2265          if (renamed != val) {
2266             ctx.renames[block.index].emplace(t, renamed);
2267             ctx.orig_names[renamed.id()] = val;
2268          }
2269       }
2270    }
2271 
2272    return register_file;
2273 }
2274 
2275 void
get_affinities(ra_ctx & ctx,std::vector<IDSet> & live_out_per_block)2276 get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
2277 {
2278    std::vector<std::vector<Temp>> phi_ressources;
2279    std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
2280 
2281    for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
2282         block_rit++) {
2283       Block& block = *block_rit;
2284 
2285       /* first, compute the death points of all live vars within the block */
2286       IDSet& live = live_out_per_block[block.index];
2287 
2288       std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
2289       for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
2290          aco_ptr<Instruction>& instr = *rit;
2291          if (is_phi(instr))
2292             break;
2293 
2294          /* add vector affinities */
2295          if (instr->opcode == aco_opcode::p_create_vector) {
2296             for (const Operand& op : instr->operands) {
2297                if (op.isTemp() && op.isFirstKill() &&
2298                    op.getTemp().type() == instr->definitions[0].getTemp().type())
2299                   ctx.vectors[op.tempId()] = instr.get();
2300             }
2301          } else if (instr->format == Format::MIMG && instr->operands.size() > 4) {
2302             for (unsigned i = 3; i < instr->operands.size(); i++)
2303                ctx.vectors[instr->operands[i].tempId()] = instr.get();
2304          }
2305 
2306          if (instr->opcode == aco_opcode::p_split_vector &&
2307              instr->operands[0].isFirstKillBeforeDef())
2308             ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
2309 
2310          /* add operands to live variables */
2311          for (const Operand& op : instr->operands) {
2312             if (op.isTemp())
2313                live.insert(op.tempId());
2314          }
2315 
2316          /* erase definitions from live */
2317          for (unsigned i = 0; i < instr->definitions.size(); i++) {
2318             const Definition& def = instr->definitions[i];
2319             if (!def.isTemp())
2320                continue;
2321             live.erase(def.tempId());
2322             /* mark last-seen phi operand */
2323             std::unordered_map<unsigned, unsigned>::iterator it =
2324                temp_to_phi_ressources.find(def.tempId());
2325             if (it != temp_to_phi_ressources.end() &&
2326                 def.regClass() == phi_ressources[it->second][0].regClass()) {
2327                phi_ressources[it->second][0] = def.getTemp();
2328                /* try to coalesce phi affinities with parallelcopies */
2329                Operand op = Operand();
2330                switch (instr->opcode) {
2331                case aco_opcode::p_parallelcopy: op = instr->operands[i]; break;
2332 
2333                case aco_opcode::v_interp_p2_f32:
2334                case aco_opcode::v_writelane_b32:
2335                case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break;
2336 
2337                case aco_opcode::v_fma_f32:
2338                case aco_opcode::v_fma_f16:
2339                case aco_opcode::v_pk_fma_f16:
2340                   if (ctx.program->chip_class < GFX10)
2341                      continue;
2342                   FALLTHROUGH;
2343                case aco_opcode::v_mad_f32:
2344                case aco_opcode::v_mad_f16:
2345                   if (instr->usesModifiers())
2346                      continue;
2347                   op = instr->operands[2];
2348                   break;
2349 
2350                default: continue;
2351                }
2352 
2353                if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
2354                   phi_ressources[it->second].emplace_back(op.getTemp());
2355                   temp_to_phi_ressources[op.tempId()] = it->second;
2356                }
2357             }
2358          }
2359       }
2360 
2361       /* collect phi affinities */
2362       for (; rit != block.instructions.rend(); ++rit) {
2363          aco_ptr<Instruction>& instr = *rit;
2364          assert(is_phi(instr));
2365 
2366          live.erase(instr->definitions[0].tempId());
2367          if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
2368             continue;
2369 
2370          assert(instr->definitions[0].isTemp());
2371          std::unordered_map<unsigned, unsigned>::iterator it =
2372             temp_to_phi_ressources.find(instr->definitions[0].tempId());
2373          unsigned index = phi_ressources.size();
2374          std::vector<Temp>* affinity_related;
2375          if (it != temp_to_phi_ressources.end()) {
2376             index = it->second;
2377             phi_ressources[index][0] = instr->definitions[0].getTemp();
2378             affinity_related = &phi_ressources[index];
2379          } else {
2380             phi_ressources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
2381             affinity_related = &phi_ressources.back();
2382          }
2383 
2384          for (const Operand& op : instr->operands) {
2385             if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
2386                affinity_related->emplace_back(op.getTemp());
2387                if (block.kind & block_kind_loop_header)
2388                   continue;
2389                temp_to_phi_ressources[op.tempId()] = index;
2390             }
2391          }
2392       }
2393 
2394       /* visit the loop header phis first in order to create nested affinities */
2395       if (block.kind & block_kind_loop_exit) {
2396          /* find loop header */
2397          auto header_rit = block_rit;
2398          while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
2399             header_rit++;
2400 
2401          for (aco_ptr<Instruction>& phi : header_rit->instructions) {
2402             if (!is_phi(phi))
2403                break;
2404             if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
2405                continue;
2406 
2407             /* create an (empty) merge-set for the phi-related variables */
2408             auto it = temp_to_phi_ressources.find(phi->definitions[0].tempId());
2409             unsigned index = phi_ressources.size();
2410             if (it == temp_to_phi_ressources.end()) {
2411                temp_to_phi_ressources[phi->definitions[0].tempId()] = index;
2412                phi_ressources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
2413             } else {
2414                index = it->second;
2415             }
2416             for (unsigned i = 1; i < phi->operands.size(); i++) {
2417                const Operand& op = phi->operands[i];
2418                if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
2419                   temp_to_phi_ressources[op.tempId()] = index;
2420                }
2421             }
2422          }
2423       }
2424    }
2425    /* create affinities */
2426    for (std::vector<Temp>& vec : phi_ressources) {
2427       for (unsigned i = 1; i < vec.size(); i++)
2428          if (vec[i].id() != vec[0].id())
2429             ctx.assignments[vec[i].id()].affinity = vec[0].id();
2430    }
2431 }
2432 
2433 } /* end namespace */
2434 
2435 void
register_allocation(Program * program,std::vector<IDSet> & live_out_per_block,ra_test_policy policy)2436 register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
2437 {
2438    ra_ctx ctx(program, policy);
2439    get_affinities(ctx, live_out_per_block);
2440 
2441    /* state of register file after phis */
2442    std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
2443 
2444    for (Block& block : program->blocks) {
2445       ctx.block = &block;
2446 
2447       /* initialize register file */
2448       RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
2449       ctx.war_hint.reset();
2450 
2451       std::vector<aco_ptr<Instruction>> instructions;
2452 
2453       /* this is a slight adjustment from the paper as we already have phi nodes:
2454        * We consider them incomplete phis and only handle the definition. */
2455       get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]);
2456 
2457       /* fill in sgpr_live_in */
2458       for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
2459          sgpr_live_in[block.index][i] = register_file[PhysReg{i}];
2460       sgpr_live_in[block.index][127] = register_file[scc];
2461 
2462       /* Handle all other instructions of the block */
2463       auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
2464       std::vector<aco_ptr<Instruction>>::iterator instr_it =
2465          std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
2466       for (; instr_it != block.instructions.end(); ++instr_it) {
2467          aco_ptr<Instruction>& instr = *instr_it;
2468 
2469          /* parallelcopies from p_phi are inserted here which means
2470           * live ranges of killed operands end here as well */
2471          if (instr->opcode == aco_opcode::p_logical_end) {
2472             /* no need to process this instruction any further */
2473             if (block.logical_succs.size() != 1) {
2474                instructions.emplace_back(std::move(instr));
2475                continue;
2476             }
2477 
2478             Block& succ = program->blocks[block.logical_succs[0]];
2479             unsigned idx = 0;
2480             for (; idx < succ.logical_preds.size(); idx++) {
2481                if (succ.logical_preds[idx] == block.index)
2482                   break;
2483             }
2484             for (aco_ptr<Instruction>& phi : succ.instructions) {
2485                if (phi->opcode == aco_opcode::p_phi) {
2486                   if (phi->operands[idx].isTemp() &&
2487                       phi->operands[idx].getTemp().type() == RegType::sgpr &&
2488                       phi->operands[idx].isFirstKillBeforeDef()) {
2489                      Definition phi_op(
2490                         read_variable(ctx, phi->operands[idx].getTemp(), block.index));
2491                      phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg);
2492                      register_file.clear(phi_op);
2493                   }
2494                } else if (phi->opcode != aco_opcode::p_linear_phi) {
2495                   break;
2496                }
2497             }
2498             instructions.emplace_back(std::move(instr));
2499             continue;
2500          }
2501 
2502          std::vector<std::pair<Operand, Definition>> parallelcopy;
2503 
2504          assert(!is_phi(instr));
2505 
2506          bool temp_in_scc = register_file[scc];
2507 
2508          /* handle operands */
2509          for (unsigned i = 0; i < instr->operands.size(); ++i) {
2510             auto& operand = instr->operands[i];
2511             if (!operand.isTemp())
2512                continue;
2513 
2514             /* rename operands */
2515             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
2516             assert(ctx.assignments[operand.tempId()].assigned);
2517 
2518             PhysReg reg = ctx.assignments[operand.tempId()].reg;
2519             if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass()))
2520                operand.setFixed(reg);
2521             else
2522                get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
2523 
2524             if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
2525                 (instr->isDS() && instr->ds().gds)) {
2526                for (unsigned j = 0; j < operand.size(); j++)
2527                   ctx.war_hint.set(operand.physReg().reg() + j);
2528             }
2529          }
2530 
2531          /* remove dead vars from register file */
2532          for (const Operand& op : instr->operands) {
2533             if (op.isTemp() && op.isFirstKillBeforeDef())
2534                register_file.clear(op);
2535          }
2536 
2537          /* try to optimize v_mad_f32 -> v_mac_f32 */
2538          if ((instr->opcode == aco_opcode::v_mad_f32 ||
2539               (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
2540               instr->opcode == aco_opcode::v_mad_f16 ||
2541               instr->opcode == aco_opcode::v_mad_legacy_f16 ||
2542               (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) ||
2543               (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10) ||
2544               (instr->opcode == aco_opcode::v_dot4_i32_i8 && program->family != CHIP_VEGA20)) &&
2545              instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() &&
2546              instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() &&
2547              instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() &&
2548              instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 &&
2549              instr->operands[2].physReg().byte() == 0) {
2550             unsigned def_id = instr->definitions[0].tempId();
2551             bool use_vop2 = true;
2552             if (ctx.assignments[def_id].affinity) {
2553                assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2554                if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
2555                    !register_file.test(affinity.reg, instr->operands[2].bytes()))
2556                   use_vop2 = false;
2557             }
2558             if (use_vop2) {
2559                instr->format = Format::VOP2;
2560                switch (instr->opcode) {
2561                case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
2562                case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
2563                case aco_opcode::v_mad_f16:
2564                case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
2565                case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
2566                case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
2567                case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
2568                default: break;
2569                }
2570             }
2571          }
2572 
2573          /* handle definitions which must have the same register as an operand */
2574          if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
2575              instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 ||
2576              instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 ||
2577              instr->opcode == aco_opcode::v_pk_fmac_f16 ||
2578              instr->opcode == aco_opcode::v_writelane_b32 ||
2579              instr->opcode == aco_opcode::v_writelane_b32_e64 ||
2580              instr->opcode == aco_opcode::v_dot4c_i32_i8) {
2581             instr->definitions[0].setFixed(instr->operands[2].physReg());
2582          } else if (instr->opcode == aco_opcode::s_addk_i32 ||
2583                     instr->opcode == aco_opcode::s_mulk_i32) {
2584             instr->definitions[0].setFixed(instr->operands[0].physReg());
2585          } else if (instr->isMUBUF() && instr->definitions.size() == 1 &&
2586                     instr->operands.size() == 4) {
2587             instr->definitions[0].setFixed(instr->operands[3].physReg());
2588          } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
2589                     !instr->operands[2].isUndefined()) {
2590             instr->definitions[0].setFixed(instr->operands[2].physReg());
2591          }
2592 
2593          ctx.defs_done.reset();
2594 
2595          /* handle fixed definitions first */
2596          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2597             auto& definition = instr->definitions[i];
2598             if (!definition.isFixed())
2599                continue;
2600 
2601             adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
2602             /* check if the target register is blocked */
2603             if (register_file.test(definition.physReg(), definition.bytes())) {
2604                const PhysRegInterval def_regs{definition.physReg(), definition.size()};
2605 
2606                /* create parallelcopy pair to move blocking vars */
2607                std::set<std::pair<unsigned, unsigned>> vars =
2608                   collect_vars(ctx, register_file, def_regs);
2609 
2610                RegisterFile tmp_file(register_file);
2611                /* re-enable the killed operands, so that we don't move the blocking vars there */
2612                for (const Operand& op : instr->operands) {
2613                   if (op.isTemp() && op.isFirstKillBeforeDef())
2614                      tmp_file.fill(op);
2615                }
2616 
2617                ASSERTED bool success = false;
2618                DefInfo info(ctx, instr, definition.regClass(), -1);
2619                success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr,
2620                                              def_regs);
2621                assert(success);
2622 
2623                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
2624             }
2625             ctx.defs_done.set(i);
2626 
2627             if (!definition.isTemp())
2628                continue;
2629 
2630             ctx.assignments[definition.tempId()].set(definition);
2631             register_file.fill(definition);
2632          }
2633 
2634          /* handle all other definitions */
2635          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2636             Definition* definition = &instr->definitions[i];
2637 
2638             if (definition->isFixed() || !definition->isTemp())
2639                continue;
2640 
2641             /* find free reg */
2642             if (definition->hasHint() &&
2643                 get_reg_specified(ctx, register_file, definition->regClass(), instr,
2644                                   definition->physReg())) {
2645                definition->setFixed(definition->physReg());
2646             } else if (instr->opcode == aco_opcode::p_split_vector) {
2647                PhysReg reg = instr->operands[0].physReg();
2648                for (unsigned j = 0; j < i; j++)
2649                   reg.reg_b += instr->definitions[j].bytes();
2650                if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
2651                   definition->setFixed(reg);
2652             } else if (instr->opcode == aco_opcode::p_wqm ||
2653                        instr->opcode == aco_opcode::p_parallelcopy) {
2654                PhysReg reg = instr->operands[i].physReg();
2655                if (instr->operands[i].isTemp() &&
2656                    instr->operands[i].getTemp().type() == definition->getTemp().type() &&
2657                    !register_file.test(reg, definition->bytes()))
2658                   definition->setFixed(reg);
2659             } else if (instr->opcode == aco_opcode::p_extract_vector) {
2660                PhysReg reg = instr->operands[0].physReg();
2661                reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
2662                if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
2663                   definition->setFixed(reg);
2664             } else if (instr->opcode == aco_opcode::p_create_vector) {
2665                PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
2666                                                    parallelcopy, instr);
2667                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
2668                definition->setFixed(reg);
2669             }
2670 
2671             if (!definition->isFixed()) {
2672                Temp tmp = definition->getTemp();
2673                if (definition->regClass().is_subdword() && definition->bytes() < 4) {
2674                   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
2675                   definition->setFixed(reg);
2676                   if (reg.byte() || register_file.test(reg, 4)) {
2677                      add_subdword_definition(program, instr, reg);
2678                      definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
2679                                                              the reference */
2680                   }
2681                } else {
2682                   definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
2683                }
2684                update_renames(ctx, register_file, parallelcopy, instr,
2685                               instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
2686                                                                            : (UpdateRenames)0);
2687             }
2688 
2689             assert(
2690                definition->isFixed() &&
2691                ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
2692                 (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
2693             ctx.defs_done.set(i);
2694             ctx.assignments[definition->tempId()].set(*definition);
2695             register_file.fill(*definition);
2696          }
2697 
2698          handle_pseudo(ctx, register_file, instr.get());
2699 
2700          /* kill definitions and late-kill operands and ensure that sub-dword operands can actually
2701           * be read */
2702          for (const Definition& def : instr->definitions) {
2703             if (def.isTemp() && def.isKill())
2704                register_file.clear(def);
2705          }
2706          for (unsigned i = 0; i < instr->operands.size(); i++) {
2707             const Operand& op = instr->operands[i];
2708             if (op.isTemp() && op.isFirstKill() && op.isLateKill())
2709                register_file.clear(op);
2710             if (op.isTemp() && op.physReg().byte() != 0)
2711                add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
2712          }
2713 
2714          /* emit parallelcopy */
2715          if (!parallelcopy.empty()) {
2716             aco_ptr<Pseudo_instruction> pc;
2717             pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
2718                                                             Format::PSEUDO, parallelcopy.size(),
2719                                                             parallelcopy.size()));
2720             bool linear_vgpr = false;
2721             bool sgpr_operands_alias_defs = false;
2722             uint64_t sgpr_operands[4] = {0, 0, 0, 0};
2723             for (unsigned i = 0; i < parallelcopy.size(); i++) {
2724                linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
2725 
2726                if (temp_in_scc && parallelcopy[i].first.isTemp() &&
2727                    parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
2728                   if (!sgpr_operands_alias_defs) {
2729                      unsigned reg = parallelcopy[i].first.physReg().reg();
2730                      unsigned size = parallelcopy[i].first.getTemp().size();
2731                      sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
2732 
2733                      reg = parallelcopy[i].second.physReg().reg();
2734                      size = parallelcopy[i].second.getTemp().size();
2735                      if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
2736                         sgpr_operands_alias_defs = true;
2737                   }
2738                }
2739 
2740                pc->operands[i] = parallelcopy[i].first;
2741                pc->definitions[i] = parallelcopy[i].second;
2742                assert(pc->operands[i].size() == pc->definitions[i].size());
2743 
2744                /* it might happen that the operand is already renamed. we have to restore the
2745                 * original name. */
2746                std::unordered_map<unsigned, Temp>::iterator it =
2747                   ctx.orig_names.find(pc->operands[i].tempId());
2748                Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
2749                ctx.orig_names[pc->definitions[i].tempId()] = orig;
2750                ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
2751             }
2752 
2753             if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
2754                /* disable definitions and re-enable operands */
2755                RegisterFile tmp_file(register_file);
2756                for (const Definition& def : instr->definitions) {
2757                   if (def.isTemp() && !def.isKill())
2758                      tmp_file.clear(def);
2759                }
2760                for (const Operand& op : instr->operands) {
2761                   if (op.isTemp() && op.isFirstKill())
2762                      tmp_file.block(op.physReg(), op.regClass());
2763                }
2764 
2765                handle_pseudo(ctx, tmp_file, pc.get());
2766             } else {
2767                pc->tmp_in_scc = false;
2768             }
2769 
2770             instructions.emplace_back(std::move(pc));
2771          }
2772 
2773          /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
2774          bool instr_needs_vop3 =
2775             !instr->isVOP3() &&
2776             ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
2777              (instr->opcode == aco_opcode::v_cndmask_b32 &&
2778               !(instr->operands[2].physReg() == vcc)) ||
2779              ((instr->opcode == aco_opcode::v_add_co_u32 ||
2780                instr->opcode == aco_opcode::v_addc_co_u32 ||
2781                instr->opcode == aco_opcode::v_sub_co_u32 ||
2782                instr->opcode == aco_opcode::v_subb_co_u32 ||
2783                instr->opcode == aco_opcode::v_subrev_co_u32 ||
2784                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
2785               !(instr->definitions[1].physReg() == vcc)) ||
2786              ((instr->opcode == aco_opcode::v_addc_co_u32 ||
2787                instr->opcode == aco_opcode::v_subb_co_u32 ||
2788                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
2789               !(instr->operands[2].physReg() == vcc)));
2790          if (instr_needs_vop3) {
2791 
2792             /* if the first operand is a literal, we have to move it to a reg */
2793             if (instr->operands.size() && instr->operands[0].isLiteral() &&
2794                 program->chip_class < GFX10) {
2795                bool can_sgpr = true;
2796                /* check, if we have to move to vgpr */
2797                for (const Operand& op : instr->operands) {
2798                   if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
2799                      can_sgpr = false;
2800                      break;
2801                   }
2802                }
2803                /* disable definitions and re-enable operands */
2804                RegisterFile tmp_file(register_file);
2805                for (const Definition& def : instr->definitions)
2806                   tmp_file.clear(def);
2807                for (const Operand& op : instr->operands) {
2808                   if (op.isTemp() && op.isFirstKill())
2809                      tmp_file.block(op.physReg(), op.regClass());
2810                }
2811                Temp tmp = program->allocateTmp(can_sgpr ? s1 : v1);
2812                ctx.assignments.emplace_back();
2813                PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr);
2814                update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
2815 
2816                aco_ptr<Instruction> mov;
2817                if (can_sgpr)
2818                   mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32,
2819                                                                  Format::SOP1, 1, 1));
2820                else
2821                   mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32,
2822                                                                  Format::VOP1, 1, 1));
2823                mov->operands[0] = instr->operands[0];
2824                mov->definitions[0] = Definition(tmp);
2825                mov->definitions[0].setFixed(reg);
2826 
2827                instr->operands[0] = Operand(tmp);
2828                instr->operands[0].setFixed(reg);
2829                instr->operands[0].setFirstKill(true);
2830 
2831                instructions.emplace_back(std::move(mov));
2832             }
2833 
2834             /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
2835             aco_ptr<Instruction> tmp = std::move(instr);
2836             Format format = asVOP3(tmp->format);
2837             instr.reset(create_instruction<VOP3_instruction>(
2838                tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
2839             std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin());
2840             std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
2841          }
2842 
2843          instructions.emplace_back(std::move(*instr_it));
2844 
2845       } /* end for Instr */
2846 
2847       block.instructions = std::move(instructions);
2848    } /* end for BB */
2849 
2850    /* find scc spill registers which may be needed for parallelcopies created by phis */
2851    for (Block& block : program->blocks) {
2852       if (block.linear_preds.size() <= 1)
2853          continue;
2854 
2855       std::bitset<128> regs = sgpr_live_in[block.index];
2856       if (!regs[127])
2857          continue;
2858 
2859       /* choose a register */
2860       int16_t reg = 0;
2861       for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
2862          ;
2863       assert(reg < ctx.program->max_reg_demand.sgpr);
2864       adjust_max_used_regs(ctx, s1, reg);
2865 
2866       /* update predecessors */
2867       for (unsigned& pred_index : block.linear_preds) {
2868          Block& pred = program->blocks[pred_index];
2869          pred.scc_live_out = true;
2870          pred.scratch_sgpr = PhysReg{(uint16_t)reg};
2871       }
2872    }
2873 
2874    /* num_gpr = rnd_up(max_used_gpr + 1) */
2875    program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
2876    program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
2877 
2878    program->progress = CompilationProgress::after_ra;
2879 }
2880 
2881 } // namespace aco
2882