1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25  *    Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26  *
27  */
28 
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33 
34 #include "aco_ir.h"
35 #include "sid.h"
36 #include "util/u_math.h"
37 
38 namespace aco {
39 namespace {
40 
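/* helpers for placing sub-dword (smaller than 4 byte) operands and definitions;
 * forward-declared here because DefInfo below uses them */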
41 struct ra_ctx;
42 
43 unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx, RegClass rc);
44 void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, RegClass rc);
45 std::pair<unsigned, unsigned> get_subdword_definition_info(Program *program, const aco_ptr<Instruction>& instr, RegClass rc);
46 void add_subdword_definition(Program *program, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg);
47 
48 struct assignment {
49    PhysReg reg;
50    RegClass rc;
51    uint8_t assigned = 0;
52    assignment() = default;
53    assignment(PhysReg reg, RegClass rc) : reg(reg), rc(rc), assigned(-1) {}
54 };
55 
56 struct phi_info {
57    Instruction* phi;
58    unsigned block_idx;
59    std::set<Instruction*> uses;
60 };
61 
62 struct ra_ctx {
63    std::bitset<512> war_hint;
64    Program* program;
65    std::vector<assignment> assignments;
66    std::vector<std::unordered_map<unsigned, Temp>> renames;
67    std::vector<std::vector<Instruction*>> incomplete_phis;
68    std::vector<bool> filled;
69    std::vector<bool> sealed;
70    std::unordered_map<unsigned, Temp> orig_names;
71    std::unordered_map<unsigned, phi_info> phi_map;
72    std::unordered_map<unsigned, unsigned> affinities;
73    std::unordered_map<unsigned, Instruction*> vectors;
74    std::unordered_map<unsigned, Instruction*> split_vectors;
75    aco_ptr<Instruction> pseudo_dummy;
76    unsigned max_used_sgpr = 0;
77    unsigned max_used_vgpr = 0;
78    std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
79 
80    ra_ctx(Program* program) : program(program),
81                               assignments(program->peekAllocationId()),
82                               renames(program->blocks.size()),
83                               incomplete_phis(program->blocks.size()),
84                               filled(program->blocks.size()),
85                               sealed(program->blocks.size())
86    {
87       pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
88    }
89 };
90 
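/* Describes where a definition of a given register class may be placed:
 * [lb, ub) bound the search in the register file (VGPRs start at index 256),
 * size is the size in dwords and stride the required alignment
 * (in registers, or in bytes for sub-dword register classes). */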
91 struct DefInfo {
92    uint16_t lb;
93    uint16_t ub;
94    uint8_t size;
95    uint8_t stride;
96    RegClass rc;
97 
98    DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_) {
99       size = rc.size();
100       stride = 1;
101 
102       if (rc.type() == RegType::vgpr) {
103          lb = 256;
104          ub = 256 + ctx.program->max_reg_demand.vgpr;
105       } else {
106          lb = 0;
107          ub = ctx.program->max_reg_demand.sgpr;
108          if (size == 2)
109             stride = 2;
110          else if (size >= 4)
111             stride = 4;
112       }
113 
114       if (rc.is_subdword() && operand >= 0) {
115          /* stride in bytes */
116          stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc);
117       } else if (rc.is_subdword()) {
118          std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
119          stride = info.first;
120          if (info.second > rc.bytes()) {
121             rc = RegClass::get(rc.type(), info.second);
122             size = rc.size();
123             /* we might still be able to put the definition in the high half,
124              * but that's only useful for affinities and this information isn't
125              * used for them */
126             stride = align(stride, info.second);
127             if (!rc.is_subdword())
128                stride = DIV_ROUND_UP(stride, 4);
129          }
130          assert(stride > 0);
131       }
132    }
133 };
134 
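/* Tracks the current contents of the physical register file.
 * regs[i] holds the id of the temporary occupying register i, 0 if the
 * register is free, 0xFFFFFFFF if it is blocked, or the marker 0xF0000000 if
 * it is only partially used by sub-dword temporaries, whose per-byte owners
 * are then kept in subdword_regs. */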
135 class RegisterFile {
136 public:
137    RegisterFile() {regs.fill(0);}
138 
139    std::array<uint32_t, 512> regs;
140    std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;
141 
142    const uint32_t& operator [] (unsigned index) const {
143       return regs[index];
144    }
145 
146    uint32_t& operator [] (unsigned index) {
147       return regs[index];
148    }
149 
150    unsigned count_zero(PhysReg start, unsigned size) {
151       unsigned res = 0;
152       for (unsigned i = 0; i < size; i++)
153          res += !regs[start + i];
154       return res;
155    }
156 
157    bool test(PhysReg start, unsigned num_bytes) {
158       for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
159          if (regs[i] & 0x0FFFFFFF)
160             return true;
161          if (regs[i] == 0xF0000000) {
162             assert(subdword_regs.find(i) != subdword_regs.end());
163             for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
164                if (subdword_regs[i][j])
165                   return true;
166             }
167          }
168       }
169       return false;
170    }
171 
172    void block(PhysReg start, RegClass rc) {
173       if (rc.is_subdword())
174          fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
175       else
176          fill(start, rc.size(), 0xFFFFFFFF);
177    }
178 
179    bool is_blocked(PhysReg start) {
180       if (regs[start] == 0xFFFFFFFF)
181          return true;
182       if (regs[start] == 0xF0000000) {
183          for (unsigned i = start.byte(); i < 4; i++)
184             if (subdword_regs[start][i] == 0xFFFFFFFF)
185                return true;
186       }
187       return false;
188    }
189 
190    bool is_empty_or_blocked(PhysReg start) {
191       if (regs[start] == 0xF0000000) {
192          return subdword_regs[start][start.byte()] + 1 <= 1;
193       }
194       return regs[start] + 1 <= 1;
195    }
196 
197    void clear(PhysReg start, RegClass rc) {
198       if (rc.is_subdword())
199          fill_subdword(start, rc.bytes(), 0);
200       else
201          fill(start, rc.size(), 0);
202    }
203 
204    void fill(Operand op) {
205       if (op.regClass().is_subdword())
206          fill_subdword(op.physReg(), op.bytes(), op.tempId());
207       else
208          fill(op.physReg(), op.size(), op.tempId());
209    }
210 
211    void clear(Operand op) {
212       clear(op.physReg(), op.regClass());
213    }
214 
215    void fill(Definition def) {
216       if (def.regClass().is_subdword())
217          fill_subdword(def.physReg(), def.bytes(), def.tempId());
218       else
219          fill(def.physReg(), def.size(), def.tempId());
220    }
221 
222    void clear(Definition def) {
223       clear(def.physReg(), def.regClass());
224    }
225 
226    unsigned get_id(PhysReg reg) {
227       return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
228    }
229 
230 private:
231    void fill(PhysReg start, unsigned size, uint32_t val) {
232       for (unsigned i = 0; i < size; i++)
233          regs[start + i] = val;
234    }
235 
236    void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) {
237       fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
238       for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
239          /* emplace or get */
240          std::array<uint32_t, 4>& sub = subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
241          for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
242             sub[j] = val;
243 
244          if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
245             subdword_regs.erase(i);
246             regs[i] = 0;
247          }
248       }
249    }
250 };
251 
252 
253 /* helper function for debugging */
254 #if 0
255 void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
256 {
257    unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
258    unsigned lb = vgprs ? 256 : 0;
259    unsigned ub = lb + max;
260    char reg_char = vgprs ? 'v' : 's';
261 
262    /* print markers */
263    printf("       ");
264    for (unsigned i = lb; i < ub; i += 3) {
265       printf("%.2u ", i - lb);
266    }
267    printf("\n");
268 
269    /* print usage */
270    printf("%cgprs: ", reg_char);
271    unsigned free_regs = 0;
272    unsigned prev = 0;
273    bool char_select = false;
274    for (unsigned i = lb; i < ub; i++) {
275       if (reg_file[i] == 0xFFFF) {
276          printf("~");
277       } else if (reg_file[i]) {
278          if (reg_file[i] != prev) {
279             prev = reg_file[i];
280             char_select = !char_select;
281          }
282          printf(char_select ? "#" : "@");
283       } else {
284          free_regs++;
285          printf(".");
286       }
287    }
288    printf("\n");
289 
290    printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);
291 
292    /* print assignments */
293    prev = 0;
294    unsigned size = 0;
295    for (unsigned i = lb; i < ub; i++) {
296       if (reg_file[i] != prev) {
297          if (prev && size > 1)
298             printf("-%d]\n", i - 1 - lb);
299          else if (prev)
300             printf("]\n");
301          prev = reg_file[i];
302          if (prev && prev != 0xFFFF) {
303             if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
304                printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
305             else
306                printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
307          }
308          size = 1;
309       } else {
310          size++;
311       }
312    }
313    if (prev && size > 1)
314       printf("-%d]\n", ub - lb - 1);
315    else if (prev)
316       printf("]\n");
317 }
318 #endif
319 
320 
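/* Returns the placement stride in bytes (1, 2 or 4) required for a sub-dword
 * operand of class rc at operand index idx; 4 means the operand has to start
 * at a full dword. */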
321 unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx, RegClass rc)
322 {
323    /* v_readfirstlane_b32 cannot use SDWA */
324    if (instr->opcode == aco_opcode::p_as_uniform)
325       return 4;
326    if (instr->format == Format::PSEUDO && chip >= GFX8)
327       return rc.bytes() % 2 == 0 ? 2 : 1;
328 
329    if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
330       return 1;
331    } else if (can_use_SDWA(chip, instr)) {
332       return rc.bytes() % 2 == 0 ? 2 : 1;
333    } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, idx, 1)) {
334       return 2;
335    }
336 
337    switch (instr->opcode) {
338    case aco_opcode::ds_write_b8:
339    case aco_opcode::ds_write_b16:
340       return chip >= GFX8 ? 2 : 4;
341    case aco_opcode::buffer_store_byte:
342    case aco_opcode::buffer_store_short:
343    case aco_opcode::flat_store_byte:
344    case aco_opcode::flat_store_short:
345    case aco_opcode::scratch_store_byte:
346    case aco_opcode::scratch_store_short:
347    case aco_opcode::global_store_byte:
348    case aco_opcode::global_store_short:
349       return chip >= GFX9 ? 2 : 4;
350    default:
351       break;
352    }
353 
354    return 4;
355 }
356 
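/* When an instruction object is replaced (e.g. converted to SDWA), re-point
 * any recorded phi uses from the old instruction to the new one. */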
357 void update_phi_map(ra_ctx& ctx, Instruction *old, Instruction *instr)
358 {
359    for (Operand& op : instr->operands) {
360       if (!op.isTemp())
361          continue;
362       std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(op.tempId());
363       if (phi != ctx.phi_map.end()) {
364          phi->second.uses.erase(old);
365          phi->second.uses.emplace(instr);
366       }
367    }
368 }
369 
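/* Rewrites the instruction so it can read operand idx from byte offset 'byte':
 * picks the matching v_cvt_f32_ubyteN opcode, converts to SDWA, sets opsel,
 * or switches to a *_d16_hi store opcode. */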
370 void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, RegClass rc)
371 {
372    chip_class chip = ctx.program->chip_class;
373    if (instr->format == Format::PSEUDO || byte == 0)
374       return;
375 
376    assert(rc.bytes() <= 2);
377 
378    if (!instr->usesModifiers() && instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
379       switch (byte) {
380       case 0:
381          instr->opcode = aco_opcode::v_cvt_f32_ubyte0;
382          break;
383       case 1:
384          instr->opcode = aco_opcode::v_cvt_f32_ubyte1;
385          break;
386       case 2:
387          instr->opcode = aco_opcode::v_cvt_f32_ubyte2;
388          break;
389       case 3:
390          instr->opcode = aco_opcode::v_cvt_f32_ubyte3;
391          break;
392       }
393       return;
394    } else if (can_use_SDWA(chip, instr)) {
395       aco_ptr<Instruction> tmp = convert_to_SDWA(chip, instr);
396       if (tmp)
397          update_phi_map(ctx, tmp.get(), instr.get());
398       return;
399    } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, idx, byte / 2)) {
400       VOP3A_instruction *vop3 = static_cast<VOP3A_instruction *>(instr.get());
401       vop3->opsel |= (byte / 2) << idx;
402       return;
403    }
404 
405    if (chip >= GFX8 && instr->opcode == aco_opcode::ds_write_b8 && byte == 2) {
406       instr->opcode = aco_opcode::ds_write_b8_d16_hi;
407       return;
408    }
409    if (chip >= GFX8 && instr->opcode == aco_opcode::ds_write_b16 && byte == 2) {
410       instr->opcode = aco_opcode::ds_write_b16_d16_hi;
411       return;
412    }
413 
414    if (chip >= GFX9 && byte == 2) {
415       if (instr->opcode == aco_opcode::buffer_store_byte)
416          instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
417       else if (instr->opcode == aco_opcode::buffer_store_short)
418          instr->opcode = aco_opcode::buffer_store_short_d16_hi;
419       else if (instr->opcode == aco_opcode::flat_store_byte)
420          instr->opcode = aco_opcode::flat_store_byte_d16_hi;
421       else if (instr->opcode == aco_opcode::flat_store_short)
422          instr->opcode = aco_opcode::flat_store_short_d16_hi;
423       else if (instr->opcode == aco_opcode::scratch_store_byte)
424          instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
425       else if (instr->opcode == aco_opcode::scratch_store_short)
426          instr->opcode = aco_opcode::scratch_store_short_d16_hi;
427       else if (instr->opcode == aco_opcode::global_store_byte)
428          instr->opcode = aco_opcode::global_store_byte_d16_hi;
429       else if (instr->opcode == aco_opcode::global_store_short)
430          instr->opcode = aco_opcode::global_store_short_d16_hi;
431       else
432          unreachable("Something went wrong: Impossible register assignment.");
433    }
434 }
435 
436 /* returns (minimum stride in bytes, number of bytes written by the instruction) for a sub-dword definition */
437 std::pair<unsigned, unsigned> get_subdword_definition_info(Program *program, const aco_ptr<Instruction>& instr, RegClass rc)
438 {
439    chip_class chip = program->chip_class;
440 
441    if (instr->format == Format::PSEUDO && chip >= GFX8)
442       return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
443    else if (instr->format == Format::PSEUDO)
444       return std::make_pair(4, rc.size() * 4u);
445 
446    unsigned bytes_written = chip >= GFX10 ? rc.bytes() : 4u;
447    switch (instr->opcode) {
448    case aco_opcode::v_mad_f16:
449    case aco_opcode::v_mad_u16:
450    case aco_opcode::v_mad_i16:
451    case aco_opcode::v_fma_f16:
452    case aco_opcode::v_div_fixup_f16:
453    case aco_opcode::v_interp_p2_f16:
454       bytes_written = chip >= GFX9 ? rc.bytes() : 4u;
455       break;
456    default:
457       break;
458    }
459    bytes_written = bytes_written > 4 ? align(bytes_written, 4) : bytes_written;
460    bytes_written = MAX2(bytes_written, instr_info.definition_size[(int)instr->opcode] / 8u);
461 
462    if (can_use_SDWA(chip, instr)) {
463       return std::make_pair(rc.bytes(), rc.bytes());
464    } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, 1)) {
465       return std::make_pair(2u, bytes_written);
466    }
467 
468    switch (instr->opcode) {
469    case aco_opcode::buffer_load_ubyte_d16:
470    case aco_opcode::buffer_load_short_d16:
471    case aco_opcode::flat_load_ubyte_d16:
472    case aco_opcode::flat_load_short_d16:
473    case aco_opcode::scratch_load_ubyte_d16:
474    case aco_opcode::scratch_load_short_d16:
475    case aco_opcode::global_load_ubyte_d16:
476    case aco_opcode::global_load_short_d16:
477    case aco_opcode::ds_read_u8_d16:
478    case aco_opcode::ds_read_u16_d16:
479       if (chip >= GFX9 && !program->sram_ecc_enabled)
480          return std::make_pair(2u, 2u);
481       else
482          return std::make_pair(2u, 4u);
483    default:
484       break;
485    }
486 
487    return std::make_pair(4u, bytes_written);
488 }
489 
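/* Rewrites the instruction so it can write definition idx to the given
 * sub-dword register: converts to SDWA, sets opsel for the high half, or
 * switches to a *_d16_hi load opcode. */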
490 void add_subdword_definition(Program *program, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
491 {
492    RegClass rc = instr->definitions[idx].regClass();
493    chip_class chip = program->chip_class;
494 
495    if (instr->format == Format::PSEUDO) {
496       return;
497    } else if (can_use_SDWA(chip, instr)) {
498       unsigned def_size = instr_info.definition_size[(int)instr->opcode];
499       if (reg.byte() || chip < GFX10 || def_size > rc.bytes() * 8u)
500          convert_to_SDWA(chip, instr);
501       return;
502    } else if (reg.byte() && rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, reg.byte() / 2)) {
503       VOP3A_instruction *vop3 = static_cast<VOP3A_instruction *>(instr.get());
504       if (reg.byte() == 2)
505          vop3->opsel |= (1 << 3); /* dst in high half */
506       return;
507    }
508 
509    if (reg.byte() == 2) {
510       if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
511          instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
512       else if (instr->opcode == aco_opcode::buffer_load_short_d16)
513          instr->opcode = aco_opcode::buffer_load_short_d16_hi;
514       else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
515          instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
516       else if (instr->opcode == aco_opcode::flat_load_short_d16)
517          instr->opcode = aco_opcode::flat_load_short_d16_hi;
518       else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
519          instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
520       else if (instr->opcode == aco_opcode::scratch_load_short_d16)
521          instr->opcode = aco_opcode::scratch_load_short_d16_hi;
522       else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
523          instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
524       else if (instr->opcode == aco_opcode::global_load_short_d16)
525          instr->opcode = aco_opcode::global_load_short_d16_hi;
526       else if (instr->opcode == aco_opcode::ds_read_u8_d16)
527          instr->opcode = aco_opcode::ds_read_u8_d16_hi;
528       else if (instr->opcode == aco_opcode::ds_read_u16_d16)
529          instr->opcode = aco_opcode::ds_read_u16_d16_hi;
530       else
531          unreachable("Something went wrong: Impossible register assignment.");
532    }
533 }
534 
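/* Records the highest SGPR/VGPR used so far; get_reg_simple uses these maxima
 * to stop searching above registers that have never been touched. */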
535 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
536 {
537    unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
538    unsigned size = rc.size();
539    if (rc.type() == RegType::vgpr) {
540       assert(reg >= 256);
541       unsigned hi = reg - 256 + size - 1;
542       ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
543    } else if (reg + rc.size() <= max_addressible_sgpr) {
544       unsigned hi = reg + size - 1;
545       ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
546    }
547 }
548 
549 
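/* Gives each new parallelcopy definition a fresh temporary id, enters it into
 * the register file and rewrites the instruction's operands (or marks them as
 * killed) so that they refer to the copied values. */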
550 void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
551                     std::vector<std::pair<Operand, Definition>>& parallelcopies,
552                     aco_ptr<Instruction>& instr, bool rename_not_killed_ops)
553 {
554    /* allocate ids and rename operands: this is done transparently here */
555    for (std::pair<Operand, Definition>& copy : parallelcopies) {
556       /* definitions that already have an id are not from this function and were already handled */
557       if (copy.second.isTemp())
558          continue;
559 
560       /* check if we moved another parallelcopy definition */
561       for (std::pair<Operand, Definition>& other : parallelcopies) {
562          if (!other.second.isTemp())
563             continue;
564          if (copy.first.getTemp() == other.second.getTemp()) {
565             copy.first.setTemp(other.first.getTemp());
566             copy.first.setFixed(other.first.physReg());
567          }
568       }
569       // FIXME: if a definition got moved, change the target location and remove the parallelcopy
570       copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
571       ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
572       assert(ctx.assignments.size() == ctx.program->peekAllocationId());
573       reg_file.fill(copy.second);
574 
575       /* check if we moved an operand */
576       bool first = true;
577       for (unsigned i = 0; i < instr->operands.size(); i++) {
578          Operand& op = instr->operands[i];
579          if (!op.isTemp())
580             continue;
581          if (op.tempId() == copy.first.tempId()) {
582             bool omit_renaming = !rename_not_killed_ops && !op.isKillBeforeDef();
583             for (std::pair<Operand, Definition>& pc : parallelcopies) {
584                PhysReg def_reg = pc.second.physReg();
585                omit_renaming &= def_reg > copy.first.physReg() ?
586                                 (copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
587                                 (def_reg + pc.second.size() <= copy.first.physReg().reg());
588             }
589             if (omit_renaming) {
590                if (first)
591                   op.setFirstKill(true);
592                else
593                   op.setKill(true);
594                first = false;
595                continue;
596             }
597             op.setTemp(copy.second.getTemp());
598             op.setFixed(copy.second.physReg());
599          }
600       }
601    }
602 }
603 
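/* Tries to find a free, suitably aligned location without moving any other
 * variable. For stride-1 classes it first tries larger power-of-two strides
 * and then falls back to a best-fit search for the smallest free gap.
 * Returns (register, success). */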
604 std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
605                                         RegisterFile& reg_file,
606                                         DefInfo info)
607 {
608    uint32_t lb = info.lb;
609    uint32_t ub = info.ub;
610    uint32_t size = info.size;
611    uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
612    RegClass rc = info.rc;
613 
614    if (stride == 1) {
615       info.rc = RegClass(rc.type(), size);
616       for (unsigned stride = 8; stride > 1; stride /= 2) {
617          if (size % stride)
618             continue;
619          info.stride = stride;
620          std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
621          if (res.second)
622             return res;
623       }
624 
625       /* best-fit algorithm: find the smallest gap the variable fits into */
626       unsigned best_pos = 0xFFFF;
627       unsigned gap_size = 0xFFFF;
628       unsigned last_pos = 0xFFFF;
629 
630       for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
631 
632          if (reg_file[current_reg] == 0 && !ctx.war_hint[current_reg]) {
633             if (last_pos == 0xFFFF)
634                last_pos = current_reg;
635 
636             /* stop searching after max_used_gpr */
637             if (current_reg == ctx.max_used_sgpr + 1 || current_reg == 256 + ctx.max_used_vgpr + 1)
638                break;
639             else
640                continue;
641          }
642 
643          if (last_pos == 0xFFFF)
644             continue;
645 
646          /* early return on exact matches */
647          if (last_pos + size == current_reg) {
648             adjust_max_used_regs(ctx, rc, last_pos);
649             return {PhysReg{last_pos}, true};
650          }
651 
652          /* check if it fits and the gap size is smaller */
653          if (last_pos + size < current_reg && current_reg - last_pos < gap_size) {
654             best_pos = last_pos;
655             gap_size = current_reg - last_pos;
656          }
657          last_pos = 0xFFFF;
658       }
659 
660       /* final check */
661       if (last_pos + size <= ub && ub - last_pos < gap_size) {
662          best_pos = last_pos;
663          gap_size = ub - last_pos;
664       }
665 
666       if (best_pos == 0xFFFF)
667          return {{}, false};
668 
669       /* find the best position within the gap, leaving a good stride for other variables */
670       unsigned buffer = gap_size - size;
671       if (buffer > 1) {
672          if (((best_pos + size) % 8 != 0 && (best_pos + buffer) % 8 == 0) ||
673              ((best_pos + size) % 4 != 0 && (best_pos + buffer) % 4 == 0) ||
674              ((best_pos + size) % 2 != 0 && (best_pos + buffer) % 2 == 0))
675             best_pos = best_pos + buffer;
676       }
677 
678       adjust_max_used_regs(ctx, rc, best_pos);
679       return {PhysReg{best_pos}, true};
680    }
681 
682    bool found = false;
683    unsigned reg_lo = lb;
684    unsigned reg_hi = lb + size - 1;
685    while (!found && reg_lo + size <= ub) {
686       if (reg_file[reg_lo] != 0) {
687          reg_lo += stride;
688          continue;
689       }
690       reg_hi = reg_lo + size - 1;
691       found = true;
692       for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
693          if (reg_file[reg] != 0 || ctx.war_hint[reg])
694             found = false;
695       }
696       if (found) {
697          adjust_max_used_regs(ctx, rc, reg_lo);
698          return {PhysReg{reg_lo}, true};
699       }
700 
701       reg_lo += stride;
702    }
703 
704    /* do this late because using the upper bytes of a register can require
705     * larger instruction encodings or copies
706     * TODO: don't do this in situations where it doesn't benefit */
707    if (rc.is_subdword()) {
708       for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
709          assert(reg_file[entry.first] == 0xF0000000);
710          if (lb > entry.first || entry.first >= ub)
711             continue;
712 
713          for (unsigned i = 0; i < 4; i+= info.stride) {
714             if (entry.second[i] != 0)
715                continue;
716 
717             bool reg_found = true;
718             for (unsigned j = 1; reg_found && i + j < 4 && j < rc.bytes(); j++)
719                reg_found &= entry.second[i + j] == 0;
720 
721             /* check neighboring reg if needed */
722             reg_found &= ((int)i <= 4 - (int)rc.bytes() || reg_file[entry.first + 1] == 0);
723             if (reg_found) {
724                PhysReg res{entry.first};
725                res.reg_b += i;
726                adjust_max_used_regs(ctx, rc, entry.first);
727                return {res, true};
728             }
729          }
730       }
731    }
732 
733    return {{}, false};
734 }
735 
736 /* collect the variables from a register area and remove them from reg_file */
737 std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& reg_file,
738                                                      PhysReg reg, unsigned size)
739 {
740    std::set<std::pair<unsigned, unsigned>> vars;
741    for (unsigned j = reg; j < reg + size; j++) {
742       if (reg_file.is_blocked(PhysReg{j}))
743          continue;
744       if (reg_file[j] == 0xF0000000) {
745          for (unsigned k = 0; k < 4; k++) {
746             unsigned id = reg_file.subdword_regs[j][k];
747             if (id) {
748                assignment& var = ctx.assignments[id];
749                vars.emplace(var.rc.bytes(), id);
750                reg_file.clear(var.reg, var.rc);
751                if (!reg_file[j])
752                   break;
753             }
754          }
755       } else if (reg_file[j] != 0) {
756          unsigned id = reg_file[j];
757          assignment& var = ctx.assignments[id];
758          vars.emplace(var.rc.bytes(), id);
759          reg_file.clear(var.reg, var.rc);
760       }
761    }
762    return vars;
763 }
764 
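/* Finds new locations for all variables in 'vars' that were evicted from the
 * definition area [def_reg_lo, def_reg_hi] and records the required
 * parallelcopies; may recursively evict further variables. Returns false if
 * no valid placement can be found. */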
765 bool get_regs_for_copies(ra_ctx& ctx,
766                          RegisterFile& reg_file,
767                          std::vector<std::pair<Operand, Definition>>& parallelcopies,
768                          const std::set<std::pair<unsigned, unsigned>> &vars,
769                          uint32_t lb, uint32_t ub,
770                          aco_ptr<Instruction>& instr,
771                          uint32_t def_reg_lo,
772                          uint32_t def_reg_hi)
773 {
774 
775    /* variables are sorted from smallest to largest */
776    /* NOTE: variables are also sorted by ID. This only slightly affects a very small number of shaders. */
777    for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
778       unsigned id = it->second;
779       assignment& var = ctx.assignments[id];
780       DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
781       uint32_t size = info.size;
782 
783       /* check if this is a dead operand, then we can re-use the space from the definition
784        * also use the correct stride for sub-dword operands */
785       bool is_dead_operand = false;
786       for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
787          if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
788             if (instr->operands[i].isKillBeforeDef())
789                is_dead_operand = true;
790             info = DefInfo(ctx, instr, var.rc, i);
791             break;
792          }
793       }
794 
795       std::pair<PhysReg, bool> res;
796       if (is_dead_operand) {
797          if (instr->opcode == aco_opcode::p_create_vector) {
798             PhysReg reg(def_reg_lo);
799             for (unsigned i = 0; i < instr->operands.size(); i++) {
800                if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
801                   res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && !reg_file.test(reg, var.rc.bytes())};
802                   break;
803                }
804                reg.reg_b += instr->operands[i].bytes();
805             }
806             if (!res.second)
807                res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
808          } else {
809             info.lb = def_reg_lo;
810             info.ub = def_reg_hi + 1;
811             res = get_reg_simple(ctx, reg_file, info);
812          }
813       } else {
814          info.lb = lb;
815          info.ub = MIN2(def_reg_lo, ub);
816          res = get_reg_simple(ctx, reg_file, info);
817          if (!res.second && def_reg_hi < ub) {
818             info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1);
819             info.ub = ub;
820             res = get_reg_simple(ctx, reg_file, info);
821          }
822       }
823 
824       if (res.second) {
825          /* mark the area as blocked */
826          reg_file.block(res.first, var.rc);
827 
828          /* create parallelcopy pair (without definition id) */
829          Temp tmp = Temp(id, var.rc);
830          Operand pc_op = Operand(tmp);
831          pc_op.setFixed(var.reg);
832          Definition pc_def = Definition(res.first, pc_op.regClass());
833          parallelcopies.emplace_back(pc_op, pc_def);
834          continue;
835       }
836 
837       unsigned best_pos = lb;
838       unsigned num_moves = 0xFF;
839       unsigned num_vars = 0;
840 
841       /* we use a sliding window to find potential positions */
842       unsigned reg_lo = lb;
843       unsigned reg_hi = lb + size - 1;
844       unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
845       for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
846          if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
847                                   (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
848             continue;
849 
850          /* second, check that we have at most k=num_moves elements in the window
851           * and no element is larger than the currently processed one */
852          unsigned k = 0;
853          unsigned n = 0;
854          unsigned last_var = 0;
855          bool found = true;
856          for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
857             if (reg_file[j] == 0 || reg_file[j] == last_var)
858                continue;
859 
860             if (reg_file.is_blocked(PhysReg{j}) || k > num_moves) {
861                found = false;
862                break;
863             }
864             if (reg_file[j] == 0xF0000000) {
865                k += 1;
866                n++;
867                continue;
868             }
869             /* we cannot split live ranges of linear vgprs */
870             if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
871                found = false;
872                break;
873             }
874             bool is_kill = false;
875             for (const Operand& op : instr->operands) {
876                if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
877                   is_kill = true;
878                   break;
879                }
880             }
881             if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
882                found = false;
883                break;
884             }
885 
886             k += ctx.assignments[reg_file[j]].rc.size();
887             last_var = reg_file[j];
888             n++;
889             if (k > num_moves || (k == num_moves && n <= num_vars)) {
890                found = false;
891                break;
892             }
893          }
894 
895          if (found) {
896             best_pos = reg_lo;
897             num_moves = k;
898             num_vars = n;
899          }
900       }
901 
902       /* FIXME: we messed up and couldn't find space for the variables to be copied */
903       if (num_moves == 0xFF)
904          return false;
905 
906       reg_lo = best_pos;
907       reg_hi = best_pos + size - 1;
908 
909       /* collect variables and block reg file */
910       std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, PhysReg{reg_lo}, size);
911 
912       /* mark the area as blocked */
913       reg_file.block(PhysReg{reg_lo}, var.rc);
914 
915       if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
916          return false;
917 
918       adjust_max_used_regs(ctx, var.rc, reg_lo);
919 
920       /* create parallelcopy pair (without definition id) */
921       Temp tmp = Temp(id, var.rc);
922       Operand pc_op = Operand(tmp);
923       pc_op.setFixed(var.reg);
924       Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
925       parallelcopies.emplace_back(pc_op, pc_def);
926    }
927 
928    return true;
929 }
930 
931 
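/* Allocates a register for 'info' while allowing live-range splits: picks the
 * window that needs the fewest moves, evicts the variables living there via
 * get_regs_for_copies and appends the resulting parallelcopies. */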
932 std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
933                                       RegisterFile& reg_file,
934                                       std::vector<std::pair<Operand, Definition>>& parallelcopies,
935                                       DefInfo info,
936                                       aco_ptr<Instruction>& instr)
937 {
938    uint32_t lb = info.lb;
939    uint32_t ub = info.ub;
940    uint32_t size = info.size;
941    uint32_t stride = info.stride;
942    RegClass rc = info.rc;
943 
944    /* check how many free regs we have */
945    unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);
946 
947    /* mark and count killed operands */
948    unsigned killed_ops = 0;
949    for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
950       if (instr->operands[j].isTemp() &&
951           instr->operands[j].isFirstKillBeforeDef() &&
952           instr->operands[j].physReg() >= lb &&
953           instr->operands[j].physReg() < ub &&
954           !reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes())) {
955          assert(instr->operands[j].isFixed());
956          reg_file.block(instr->operands[j].physReg(), instr->operands[j].regClass());
957          killed_ops += instr->operands[j].getTemp().size();
958       }
959    }
960 
961    assert(regs_free >= size);
962    /* we might have to move dead operands to dst in order to make space */
963    unsigned op_moves = 0;
964 
965    if (size > (regs_free - killed_ops))
966       op_moves = size - (regs_free - killed_ops);
967 
968    /* find the best position to place the definition */
969    unsigned best_pos = lb;
970    unsigned num_moves = 0xFF;
971    unsigned num_vars = 0;
972 
973    /* we use a sliding window to check potential positions */
974    unsigned reg_lo = lb;
975    unsigned reg_hi = lb + size - 1;
976    for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
977       /* first check the edges: this is what we have to fix to allow for num_moves > size */
978       if (reg_lo > lb && !reg_file.is_empty_or_blocked(PhysReg(reg_lo)) &&
979           reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1)))
980          continue;
981       if (reg_hi < ub - 1 && !reg_file.is_empty_or_blocked(PhysReg(reg_hi).advance(3)) &&
982           reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4)))
983          continue;
984 
985       /* second, check that we have at most k=num_moves elements in the window
986        * and no element is larger than the currently processed one */
987       unsigned k = op_moves;
988       unsigned n = 0;
989       unsigned remaining_op_moves = op_moves;
990       unsigned last_var = 0;
991       bool found = true;
992       bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
993       for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
994          if (reg_file[j] == 0 || reg_file[j] == last_var)
995             continue;
996 
997          /* dead operands effectively reduce the number of estimated moves */
998          if (reg_file.is_blocked(PhysReg{j})) {
999             if (remaining_op_moves) {
1000                k--;
1001                remaining_op_moves--;
1002             }
1003             continue;
1004          }
1005 
1006          if (reg_file[j] == 0xF0000000) {
1007             k += 1;
1008             n++;
1009             continue;
1010          }
1011 
1012          if (ctx.assignments[reg_file[j]].rc.size() >= size) {
1013             found = false;
1014             break;
1015          }
1016 
1017          /* we cannot split live ranges of linear vgprs */
1018          if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
1019             found = false;
1020             break;
1021          }
1022 
1023          k += ctx.assignments[reg_file[j]].rc.size();
1024          n++;
1025          last_var = reg_file[j];
1026       }
1027 
1028       if (!found || k > num_moves)
1029          continue;
1030       if (k == num_moves && n < num_vars)
1031          continue;
1032       if (!aligned && k == num_moves && n == num_vars)
1033          continue;
1034 
1035       if (found) {
1036          best_pos = reg_lo;
1037          num_moves = k;
1038          num_vars = n;
1039       }
1040    }
1041 
1042    if (num_moves == 0xFF) {
1043       /* remove killed operands from reg_file once again */
1044       for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
1045          if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef())
1046             reg_file.clear(instr->operands[i]);
1047       }
1048       for (unsigned i = 0; i < instr->definitions.size(); i++) {
1049          Definition def = instr->definitions[i];
1050          if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
1051             reg_file.fill(def);
1052       }
1053       return {{}, false};
1054    }
1055 
1056    RegisterFile register_file = reg_file;
1057 
1058    /* now, we figured the placement for our definition */
1059    std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
1060 
1061    if (instr->opcode == aco_opcode::p_create_vector) {
1062       /* move killed operands which aren't yet at the correct position (GFX9+)
1063        * or which are in the definition space */
1064       PhysReg reg = PhysReg{best_pos};
1065       for (Operand& op : instr->operands) {
1066          if (op.isTemp() && op.isFirstKillBeforeDef() &&
1067              op.getTemp().type() == rc.type()) {
1068             if (op.physReg() != reg &&
1069                 (ctx.program->chip_class >= GFX9 ||
1070                  (op.physReg().advance(op.bytes()) > PhysReg{best_pos} &&
1071                   op.physReg() < PhysReg{best_pos + size}))) {
1072                vars.emplace(op.bytes(), op.tempId());
1073                reg_file.clear(op);
1074             } else {
1075                reg_file.fill(op);
1076             }
1077          }
1078          reg.reg_b += op.bytes();
1079       }
1080    } else if (!is_phi(instr)) {
1081       /* re-enable killed operands */
1082       for (Operand& op : instr->operands) {
1083          if (op.isTemp() && op.isFirstKillBeforeDef())
1084             reg_file.fill(op);
1085       }
1086    }
1087 
1088    std::vector<std::pair<Operand, Definition>> pc;
1089    if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
1090       reg_file = std::move(register_file);
1091       /* remove killed operands from reg_file once again */
1092       if (!is_phi(instr)) {
1093          for (const Operand& op : instr->operands) {
1094             if (op.isTemp() && op.isFirstKillBeforeDef())
1095                reg_file.clear(op);
1096          }
1097       }
1098       for (unsigned i = 0; i < instr->definitions.size(); i++) {
1099          Definition& def = instr->definitions[i];
1100          if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
1101             reg_file.fill(def);
1102       }
1103       return {{}, false};
1104    }
1105 
1106    parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1107 
1108    /* we clear the definition registers here; the actual caller is responsible for setting them correctly */
1109    reg_file.clear(PhysReg{best_pos}, rc);
1110 
1111    update_renames(ctx, reg_file, parallelcopies, instr, instr->opcode != aco_opcode::p_create_vector);
1112 
1113    /* remove killed operands from reg_file once again */
1114    for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
1115       if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
1116          continue;
1117       assert(!instr->operands[i].isUndefined());
1118       if (instr->operands[i].isFirstKillBeforeDef())
1119          reg_file.clear(instr->operands[i]);
1120    }
1121    for (unsigned i = 0; i < instr->definitions.size(); i++) {
1122       Definition def = instr->definitions[i];
1123       if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
1124          reg_file.fill(def);
1125    }
1126 
1127    adjust_max_used_regs(ctx, rc, best_pos);
1128    return {PhysReg{best_pos}, true};
1129 }
1130 
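/* Checks whether the specific register 'reg' is a legal and currently free
 * location for class rc (bounds, stride and sub-dword constraints). This only
 * tests the location; no parallelcopies are created. */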
1131 bool get_reg_specified(ra_ctx& ctx,
1132                        RegisterFile& reg_file,
1133                        RegClass rc,
1134                        std::vector<std::pair<Operand, Definition>>& parallelcopies,
1135                        aco_ptr<Instruction>& instr,
1136                        PhysReg reg)
1137 {
1138    std::pair<unsigned, unsigned> sdw_def_info;
1139    if (rc.is_subdword())
1140       sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);
1141 
1142    if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
1143       return false;
1144    if (!rc.is_subdword() && reg.byte())
1145       return false;
1146 
1147    uint32_t size = rc.size();
1148    uint32_t stride = 1;
1149    uint32_t lb, ub;
1150 
1151    if (rc.type() == RegType::vgpr) {
1152       lb = 256;
1153       ub = 256 + ctx.program->max_reg_demand.vgpr;
1154    } else {
1155       if (size == 2)
1156          stride = 2;
1157       else if (size >= 4)
1158          stride = 4;
1159       if (reg % stride != 0)
1160          return false;
1161       lb = 0;
1162       ub = ctx.program->max_reg_demand.sgpr;
1163    }
1164 
1165    uint32_t reg_lo = reg.reg();
1166    uint32_t reg_hi = reg + (size - 1);
1167 
1168    if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
1169       return false;
1170 
1171    if (rc.is_subdword()) {
1172       PhysReg test_reg;
1173       test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
1174       if (reg_file.test(test_reg, sdw_def_info.second))
1175          return false;
1176    } else {
1177       if (reg_file.test(reg, rc.bytes()))
1178          return false;
1179    }
1180 
1181    adjust_max_used_regs(ctx, rc, reg_lo);
1182    return true;
1183 }
1184 
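/* Main allocation entry point for a temporary: tries split-vector and affinity
 * hints, then a position compatible with a containing vector, then a free gap
 * (get_reg_simple), then live-range splitting (get_reg_impl), and as a last
 * resort increases the program's register demand. */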
1185 PhysReg get_reg(ra_ctx& ctx,
1186                 RegisterFile& reg_file,
1187                 Temp temp,
1188                 std::vector<std::pair<Operand, Definition>>& parallelcopies,
1189                 aco_ptr<Instruction>& instr,
1190                 int operand_index=-1)
1191 {
1192    auto split_vec = ctx.split_vectors.find(temp.id());
1193    if (split_vec != ctx.split_vectors.end()) {
1194       unsigned offset = 0;
1195       for (Definition def : split_vec->second->definitions) {
1196          auto affinity_it = ctx.affinities.find(def.tempId());
1197          if (affinity_it != ctx.affinities.end() && ctx.assignments[affinity_it->second].assigned) {
1198             PhysReg reg = ctx.assignments[affinity_it->second].reg;
1199             reg.reg_b -= offset;
1200             if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
1201                return reg;
1202          }
1203          offset += def.bytes();
1204       }
1205    }
1206 
1207    if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&
1208        ctx.assignments[ctx.affinities[temp.id()]].assigned) {
1209       PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;
1210       if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
1211          return reg;
1212    }
1213 
1214    if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
1215       Instruction* vec = ctx.vectors[temp.id()];
1216       unsigned byte_offset = 0;
1217       for (const Operand& op : vec->operands) {
1218          if (op.isTemp() && op.tempId() == temp.id())
1219             break;
1220          else
1221             byte_offset += op.bytes();
1222       }
1223       unsigned k = 0;
1224       for (const Operand& op : vec->operands) {
1225          if (op.isTemp() &&
1226              op.tempId() != temp.id() &&
1227              op.getTemp().type() == temp.type() &&
1228              ctx.assignments[op.tempId()].assigned) {
1229             PhysReg reg = ctx.assignments[op.tempId()].reg;
1230             reg.reg_b += (byte_offset - k);
1231             if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
1232                return reg;
1233          }
1234          k += op.bytes();
1235       }
1236 
1237       DefInfo info(ctx, ctx.pseudo_dummy, vec->definitions[0].regClass(), -1);
1238       std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
1239       PhysReg reg = res.first;
1240       if (res.second) {
1241          reg.reg_b += byte_offset;
1242          /* make sure to only use byte offset if the instruction supports it */
1243          if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
1244             return reg;
1245       }
1246    }
1247 
1248    DefInfo info(ctx, instr, temp.regClass(), operand_index);
1249 
1250    /* try to find space without live-range splits */
1251    std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
1252 
1253    if (res.second)
1254       return res.first;
1255 
1256    /* try to find space with live-range splits */
1257    res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
1258 
1259    if (res.second)
1260       return res.first;
1261 
1262    /* try using more registers */
1263 
1264    /* We should only fail here because keeping under the limit would require
1265     * too many moves. */
1266    assert(reg_file.count_zero(PhysReg{info.lb}, info.ub-info.lb) >= info.size);
1267 
1268    uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
1269    uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
1270    if (info.rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
1271       update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
1272       return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
1273    } else if (info.rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
1274       update_vgpr_sgpr_demand(ctx.program,  RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
1275       return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
1276    }
1277 
1278    //FIXME: if nothing helps, shift-rotate the registers to make space
1279 
1280    aco_err(ctx.program, "Failed to allocate registers during shader compilation.");
1281    abort();
1282 }
1283 
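/* Chooses the definition register for p_create_vector such that as many killed
 * operands as possible are already in their final position; falls back to
 * get_reg if that would not save enough moves. */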
1284 PhysReg get_reg_create_vector(ra_ctx& ctx,
1285                               RegisterFile& reg_file,
1286                               Temp temp,
1287                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
1288                               aco_ptr<Instruction>& instr)
1289 {
1290    RegClass rc = temp.regClass();
1291    /* create_vector instructions have different costs w.r.t. register coalescing */
1292    uint32_t size = rc.size();
1293    uint32_t bytes = rc.bytes();
1294    uint32_t stride = 1;
1295    uint32_t lb, ub;
1296    if (rc.type() == RegType::vgpr) {
1297       lb = 256;
1298       ub = 256 + ctx.program->max_reg_demand.vgpr;
1299    } else {
1300       lb = 0;
1301       ub = ctx.program->max_reg_demand.sgpr;
1302       if (size == 2)
1303          stride = 2;
1304       else if (size >= 4)
1305          stride = 4;
1306    }
1307 
1308    //TODO: improve p_create_vector for sub-dword vectors
1309 
1310    unsigned best_pos = -1;
1311    unsigned num_moves = 0xFF;
1312    bool best_war_hint = true;
1313 
1314    /* for each operand, test which definition placement causes the fewest shuffle instructions */
1315    for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1316       // TODO: think about whether we can alias live operands on the same register
1317       if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
1318          continue;
1319 
1320       if (offset > instr->operands[i].physReg().reg_b)
1321          continue;
1322 
1323       unsigned reg_lo = instr->operands[i].physReg().reg_b - offset;
1324       if (reg_lo % 4)
1325          continue;
1326       reg_lo /= 4;
1327       unsigned reg_hi = reg_lo + size - 1;
1328       unsigned k = 0;
1329 
1330       /* no need to check multiple times */
1331       if (reg_lo == best_pos)
1332          continue;
1333 
1334       /* check borders */
1335       // TODO: this can be improved
1336       if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
1337          continue;
1338       if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1)))
1339          continue;
1340       if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4)))
1341          continue;
1342 
1343       /* count variables to be moved and check war_hint */
1344       bool war_hint = false;
1345       bool linear_vgpr = false;
1346       for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
1347          if (reg_file[j] != 0) {
1348             if (reg_file[j] == 0xF0000000) {
1349                PhysReg reg;
1350                reg.reg_b = j * 4;
1351                unsigned bytes_left = bytes - (j - reg_lo) * 4;
1352                for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
1353                   k += reg_file.test(reg, 1);
1354             } else {
1355                k += 4;
1356                /* we cannot split live ranges of linear vgprs */
1357                if (ctx.assignments[reg_file[j]].rc & (1 << 6))
1358                   linear_vgpr = true;
1359             }
1360          }
1361          war_hint |= ctx.war_hint[j];
1362       }
1363       if (linear_vgpr || (war_hint && !best_war_hint))
1364          continue;
1365 
1366       /* count operands in wrong positions */
1367       for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].bytes(), j++) {
1368          if (j == i ||
1369              !instr->operands[j].isTemp() ||
1370              instr->operands[j].getTemp().type() != rc.type())
1371             continue;
1372          if (instr->operands[j].physReg().reg_b != reg_lo * 4 + offset)
1373             k += instr->operands[j].bytes();
1374       }
1375       bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
1376       if (k > num_moves || (!aligned && k == num_moves))
1377          continue;
1378 
1379       best_pos = reg_lo;
1380       num_moves = k;
1381       best_war_hint = war_hint;
1382    }
1383 
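        /* if creating the vector here would move at least as many bytes as the
         * vector itself contains, fall back to the generic allocator */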
1384    if (num_moves >= bytes)
1385       return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1386 
1387    /* re-enable killed operands which are in the wrong position */
1388    for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1389       if (instr->operands[i].isTemp() &&
1390           instr->operands[i].isFirstKillBeforeDef() &&
1391           instr->operands[i].physReg().reg_b != best_pos * 4 + offset)
1392          reg_file.fill(instr->operands[i]);
1393    }
1394 
1395    /* collect variables to be moved */
1396    std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
1397 
1398    for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1399       if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
1400           instr->operands[i].getTemp().type() != rc.type())
1401          continue;
1402       bool correct_pos = instr->operands[i].physReg().reg_b == best_pos * 4 + offset;
1403       /* GFX9+: move killed operands which aren't yet at the correct position
1404        * Moving all killed operands generally leads to more register swaps.
1405        * This is only done on GFX9+ because of the cheap v_swap instruction.
1406        */
1407       if (ctx.program->chip_class >= GFX9 && !correct_pos) {
1408          vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
1409          reg_file.clear(instr->operands[i]);
1410       /* fill operands which are in the correct position to avoid overwriting */
1411       } else if (correct_pos) {
1412          reg_file.fill(instr->operands[i]);
1413       }
1414    }
1415    ASSERTED bool success = false;
1416    success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
1417    assert(success);
1418 
1419    update_renames(ctx, reg_file, parallelcopies, instr, false);
1420    adjust_max_used_regs(ctx, rc, best_pos);
1421 
1422    /* remove killed operands from reg_file once again */
1423    for (unsigned i = 0; i < instr->operands.size(); i++) {
1424       if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
1425          continue;
1426       assert(!instr->operands[i].isUndefined());
1427       if (instr->operands[i].isFirstKillBeforeDef())
1428          reg_file.clear(instr->operands[i]);
1429    }
1430 
1431    return PhysReg{best_pos};
1432 }
1433 
1434 void handle_pseudo(ra_ctx& ctx,
1435                    const RegisterFile& reg_file,
1436                    Instruction* instr)
1437 {
1438    if (instr->format != Format::PSEUDO)
1439       return;
1440 
1441    /* all instructions which use handle_operands() need this information */
1442    switch (instr->opcode) {
1443    case aco_opcode::p_extract_vector:
1444    case aco_opcode::p_create_vector:
1445    case aco_opcode::p_split_vector:
1446    case aco_opcode::p_parallelcopy:
1447    case aco_opcode::p_wqm:
1448       break;
1449    default:
1450       return;
1451    }
1452 
1453    /* if all definitions are vgpr, no need to care for SCC */
1454    bool writes_sgpr = false;
1455    for (Definition& def : instr->definitions) {
1456       if (def.getTemp().type() == RegType::sgpr) {
1457          writes_sgpr = true;
1458          break;
1459       }
1460    }
1461    /* if all operands are constant, no need to care either */
1462    bool reads_sgpr = false;
1463    bool reads_subdword = false;
1464    for (Operand& op : instr->operands) {
1465       if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1466          reads_sgpr = true;
1467          break;
1468       }
1469       if (op.isTemp() && op.regClass().is_subdword())
1470          reads_subdword = true;
1471    }
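        /* these pseudos may need a scratch SGPR during lowering: when SGPRs
         * are both read and written (a live SCC value may have to be saved),
         * or on GFX6/7 when sub-dword operands are read */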
1472    bool needs_scratch_reg = (writes_sgpr && reads_sgpr) ||
1473                             (ctx.program->chip_class <= GFX7 && reads_subdword);
1474    if (!needs_scratch_reg)
1475       return;
1476 
1477    Pseudo_instruction *pi = (Pseudo_instruction *)instr;
1478    if (reg_file[scc.reg()]) {
1479       pi->tmp_in_scc = true;
1480 
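           /* SCC holds a live value: reserve a free SGPR it can be saved to.
            * Search downwards from the highest SGPR used so far, then upwards;
            * m0 is the last resort (only valid for the sub-dword case) */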
1481       int reg = ctx.max_used_sgpr;
1482       for (; reg >= 0 && reg_file[reg]; reg--)
1483          ;
1484       if (reg < 0) {
1485          reg = ctx.max_used_sgpr + 1;
1486          for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
1487             ;
1488          if (reg == ctx.program->max_reg_demand.sgpr) {
1489             assert(reads_subdword && reg_file[m0] == 0);
1490             reg = m0;
1491          }
1492       }
1493 
1494       adjust_max_used_regs(ctx, s1, reg);
1495       pi->scratch_sgpr = PhysReg{(unsigned)reg};
1496    } else {
1497       pi->tmp_in_scc = false;
1498    }
1499 }
1500 
1501 bool operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg, RegClass rc)
1502 {
1503    if (instr->operands[idx].isFixed())
1504       return instr->operands[idx].physReg() == reg;
1505 
1506    bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
1507                        instr->opcode == aco_opcode::v_writelane_b32_e64;
1508    if (chip <= GFX9 && is_writelane && idx <= 1) {
1509       /* v_writelane_b32 can take two sgprs but only if one is m0. */
1510       bool is_other_sgpr = instr->operands[!idx].isTemp() &&
1511                            (!instr->operands[!idx].isFixed() ||
1512                             instr->operands[!idx].physReg() != m0);
1513       if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {
1514          instr->operands[idx].setFixed(m0);
1515          return reg == m0;
1516       }
1517    }
1518 
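        /* sub-dword operands may only start at byte offsets that are a multiple
         * of the stride the instruction supports */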
1519    if (reg.byte()) {
1520       unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc);
1521       if (reg.byte() % stride)
1522          return false;
1523    }
1524 
1525    switch (instr->format) {
1526    case Format::SMEM:
1527       return reg != scc &&
1528              reg != exec &&
1529              (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1530              (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
1531    default:
1532       // TODO: there are more instructions with restrictions on registers
1533       return true;
1534    }
1535 }
1536 
1537 void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
1538                          std::vector<std::pair<Operand, Definition>>& parallelcopy,
1539                          aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
1540 {
1541    /* check if the operand is fixed */
1542    PhysReg dst;
1543    if (operand.isFixed()) {
1544       assert(operand.physReg() != ctx.assignments[operand.tempId()].reg);
1545 
1546       /* check if target reg is blocked, and move away the blocking var */
1547       if (register_file[operand.physReg().reg()]) {
1548          assert(register_file[operand.physReg()] != 0xF0000000);
1549          uint32_t blocking_id = register_file[operand.physReg().reg()];
1550          RegClass rc = ctx.assignments[blocking_id].rc;
1551          Operand pc_op = Operand(Temp{blocking_id, rc});
1552          pc_op.setFixed(operand.physReg());
1553 
1554          /* find free reg */
1555          PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, ctx.pseudo_dummy);
1556          Definition pc_def = Definition(PhysReg{reg}, pc_op.regClass());
1557          register_file.clear(pc_op);
1558          parallelcopy.emplace_back(pc_op, pc_def);
1559       }
1560       dst = operand.physReg();
1561 
1562    } else {
1563       dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
1564    }
1565 
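        /* emit a parallelcopy moving the operand from its current assignment
         * into the chosen register and rename the operand accordingly */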
1566    Operand pc_op = operand;
1567    pc_op.setFixed(ctx.assignments[operand.tempId()].reg);
1568    Definition pc_def = Definition(dst, pc_op.regClass());
1569    register_file.clear(pc_op);
1570    parallelcopy.emplace_back(pc_op, pc_def);
1571    update_renames(ctx, register_file, parallelcopy, instr, true);
1572 }
1573 
1574 Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
1575 {
1576    std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
1577    if (it == ctx.renames[block_idx].end())
1578       return val;
1579    else
1580       return it->second;
1581 }
1582 
1583 Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block)
1584 {
1585    std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
1586    if (preds.size() == 0 || val.regClass() == val.regClass().as_linear())
1587       return val;
1588 
1589    assert(preds.size() > 0);
1590 
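        /* three cases: the block is not sealed yet (create an incomplete phi
         * that is completed once all predecessors are filled), it has a single
         * predecessor (just forward that rename), or it is sealed with several
         * predecessors (insert a phi only if the renames disagree) */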
1591    Temp new_val;
1592    if (!ctx.sealed[block->index]) {
1593       /* consider rename from already processed predecessor */
1594       Temp tmp = read_variable(ctx, val, preds[0]);
1595 
1596       /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1597       new_val = ctx.program->allocateTmp(val.regClass());
1598       ctx.assignments.emplace_back();
1599       aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1600       aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1601       phi->definitions[0] = Definition(new_val);
1602       for (unsigned i = 0; i < preds.size(); i++)
1603          phi->operands[i] = Operand(val);
1604       if (tmp.regClass() == new_val.regClass())
1605          ctx.affinities[new_val.id()] = tmp.id();
1606 
1607       ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1608       ctx.incomplete_phis[block->index].emplace_back(phi.get());
1609       block->instructions.insert(block->instructions.begin(), std::move(phi));
1610 
1611    } else if (preds.size() == 1) {
1612       /* if the block has only one predecessor, just look there for the name */
1613       new_val = read_variable(ctx, val, preds[0]);
1614    } else {
1615       /* there are multiple predecessors and the block is sealed */
1616       Temp *const ops = (Temp *)alloca(preds.size() * sizeof(Temp));
1617 
1618       /* get the rename from each predecessor and check if they are the same */
1619       bool needs_phi = false;
1620       for (unsigned i = 0; i < preds.size(); i++) {
1621          ops[i] = read_variable(ctx, val, preds[i]);
1622          if (i == 0)
1623             new_val = ops[i];
1624          else
1625             needs_phi |= !(new_val == ops[i]);
1626       }
1627 
1628       if (needs_phi) {
1629          /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1630          aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1631          aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1632          new_val = ctx.program->allocateTmp(val.regClass());
1633          phi->definitions[0] = Definition(new_val);
1634          for (unsigned i = 0; i < preds.size(); i++) {
1635             phi->operands[i] = Operand(ops[i]);
1636             phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
1637             if (ops[i].regClass() == new_val.regClass())
1638                ctx.affinities[new_val.id()] = ops[i].id();
1639             /* make sure the operand gets its original name in case
1640              * it comes from an incomplete phi */
1641             std::unordered_map<unsigned, phi_info>::iterator it = ctx.phi_map.find(ops[i].id());
1642             if (it != ctx.phi_map.end())
1643                it->second.uses.emplace(phi.get());
1644          }
1645          ctx.assignments.emplace_back();
1646          assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1647          ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1648          block->instructions.insert(block->instructions.begin(), std::move(phi));
1649       }
1650    }
1651 
1652    if (new_val != val) {
1653       ctx.renames[block->index][val.id()] = new_val;
1654       ctx.orig_names[new_val.id()] = val;
1655    }
1656    return new_val;
1657 }
1658 
1659 void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
1660 {
1661    std::unordered_map<unsigned, phi_info>::iterator info = ctx.phi_map.find(temp.id());
1662 
1663    if (info == ctx.phi_map.end() || !ctx.sealed[info->second.block_idx])
1664       return;
1665 
1666    assert(info->second.block_idx != 0);
1667    Instruction* phi = info->second.phi;
1668    Temp same = Temp();
1669    Definition def = phi->definitions[0];
1670 
1671    /* a phi node is trivial if all operands are the same as the definition of the phi */
1672    for (const Operand& op : phi->operands) {
1673       const Temp t = op.getTemp();
1674       if (t == same || t == def.getTemp()) {
1675          assert(t == same || op.physReg() == def.physReg());
1676          continue;
1677       }
1678       if (same != Temp() || op.physReg() != def.physReg())
1679          return;
1680 
1681       same = t;
1682    }
1683    assert(same != Temp() || same == def.getTemp());
1684 
1685    /* reroute all uses to same and remove phi */
1686    std::vector<Temp> phi_users;
1687    std::unordered_map<unsigned, phi_info>::iterator same_phi_info = ctx.phi_map.find(same.id());
1688    for (Instruction* instr : info->second.uses) {
1689       assert(phi != instr);
1690       /* recursively try to remove trivial phis */
1691       if (is_phi(instr)) {
1692          /* ignore if the phi was already flagged trivial */
1693          if (instr->definitions.empty())
1694             continue;
1695 
1696          if (instr->definitions[0].getTemp() != temp)
1697             phi_users.emplace_back(instr->definitions[0].getTemp());
1698       }
1699       for (Operand& op : instr->operands) {
1700          if (op.isTemp() && op.tempId() == def.tempId()) {
1701             op.setTemp(same);
1702             if (same_phi_info != ctx.phi_map.end())
1703                same_phi_info->second.uses.emplace(instr);
1704          }
1705       }
1706    }
1707 
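        /* also redirect per-block renames that still map the original variable
         * to the removed phi's definition */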
1708    auto it = ctx.orig_names.find(same.id());
1709    unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
1710    for (unsigned i = 0; i < ctx.program->blocks.size(); i++) {
1711       auto it = ctx.renames[i].find(orig_var);
1712       if (it != ctx.renames[i].end() && it->second == def.getTemp())
1713          ctx.renames[i][orig_var] = same;
1714    }
1715 
1716    phi->definitions.clear(); /* this indicates that the phi can be removed */
1717    ctx.phi_map.erase(info);
1718    for (Temp t : phi_users)
1719       try_remove_trivial_phi(ctx, t);
1720 
1721    return;
1722 }
1723 
1724 } /* end namespace */
1725 
1726 
1727 void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block)
1728 {
1729    ra_ctx ctx(program);
1730    std::vector<std::vector<Temp>> phi_ressources;
1731    std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
1732 
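        /* first pass: walk the program in reverse to compute kill points and to
         * collect affinity classes (phi-related temporaries, parallelcopies and
         * mad/fma accumulators that benefit from sharing a register) */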
1733    for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
1734       Block& block = *it;
1735 
1736       /* first, compute the death points of all live vars within the block */
1737       IDSet& live = live_out_per_block[block.index];
1738 
1739       std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
1740       for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
1741          aco_ptr<Instruction>& instr = *rit;
1742          if (is_phi(instr)) {
1743             if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
1744                live.erase(instr->definitions[0].tempId());
1745                continue;
1746             }
1747             /* collect information about affinity-related temporaries */
1748             std::vector<Temp> affinity_related;
1749             /* affinity_related[0] is the last seen affinity-related temp */
1750             affinity_related.emplace_back(instr->definitions[0].getTemp());
1751             affinity_related.emplace_back(instr->definitions[0].getTemp());
1752             for (const Operand& op : instr->operands) {
1753                if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
1754                   affinity_related.emplace_back(op.getTemp());
1755                   temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
1756                }
1757             }
1758             phi_ressources.emplace_back(std::move(affinity_related));
1759          } else {
1760             /* add vector affinities */
1761             if (instr->opcode == aco_opcode::p_create_vector) {
1762                for (const Operand& op : instr->operands) {
1763                   if (op.isTemp() && op.isFirstKill() && op.getTemp().type() == instr->definitions[0].getTemp().type())
1764                      ctx.vectors[op.tempId()] = instr.get();
1765                }
1766             }
1767 
1768             if (instr->opcode == aco_opcode::p_split_vector && instr->operands[0].isFirstKillBeforeDef())
1769                ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
1770 
1771             /* add operands to live variables */
1772             for (const Operand& op : instr->operands) {
1773                if (op.isTemp())
1774                   live.insert(op.tempId());
1775             }
1776          }
1777 
1778          /* erase definitions from live */
1779          for (unsigned i = 0; i < instr->definitions.size(); i++) {
1780             const Definition& def = instr->definitions[i];
1781             if (!def.isTemp())
1782                continue;
1783             live.erase(def.tempId());
1784             /* mark last-seen phi operand */
1785             std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
1786             if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
1787                phi_ressources[it->second][0] = def.getTemp();
1788                /* try to coalesce phi affinities with parallelcopies */
1789                Operand op = Operand();
1790                if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy)
1791                   op = instr->operands[i];
1792                else if ((instr->opcode == aco_opcode::v_mad_f32 ||
1793                         (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
1794                         instr->opcode == aco_opcode::v_mad_f16 ||
1795                         instr->opcode == aco_opcode::v_mad_legacy_f16 ||
1796                         (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && !instr->usesModifiers())
1797                   op = instr->operands[2];
1798 
1799                if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
1800                   phi_ressources[it->second].emplace_back(op.getTemp());
1801                   temp_to_phi_ressources[op.tempId()] = it->second;
1802                }
1803             }
1804          }
1805       }
1806    }
1807    /* create affinities */
1808    for (std::vector<Temp>& vec : phi_ressources) {
1809       assert(vec.size() > 1);
1810       for (unsigned i = 1; i < vec.size(); i++)
1811          if (vec[i].id() != vec[0].id())
1812             ctx.affinities[vec[i].id()] = vec[0].id();
1813    }
1814 
1815    /* state of register file after phis */
1816    std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1817 
1818    for (Block& block : program->blocks) {
1819       IDSet& live = live_out_per_block[block.index];
1820       /* initialize register file */
1821       assert(block.index != 0 || live.empty());
1822       RegisterFile register_file;
1823       ctx.war_hint.reset();
1824 
1825       for (unsigned t : live) {
1826          Temp renamed = handle_live_in(ctx, Temp(t, program->temp_rc[t]), &block);
1827          assignment& var = ctx.assignments[renamed.id()];
1828          /* due to live-range splits, the live-in might be a phi, now */
1829          /* due to live-range splits, the live-in might now be a phi */
1830             register_file.fill(Definition(renamed.id(), var.reg, var.rc));
1831       }
1832 
1833       std::vector<aco_ptr<Instruction>> instructions;
1834       std::vector<aco_ptr<Instruction>>::iterator it;
1835 
1836       /* this is a slight adjustment from the paper as we already have phi nodes:
1837        * we consider them incomplete phis and only handle the definition. */
1838 
1839       /* handle fixed phi definitions */
1840       for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1841          aco_ptr<Instruction>& phi = *it;
1842          if (!is_phi(phi))
1843             break;
1844          Definition& definition = phi->definitions[0];
1845          if (!definition.isFixed())
1846             continue;
1847 
1848          /* check if a dead exec mask phi is needed */
1849          if (definition.isKill()) {
1850             for (Operand& op : phi->operands) {
1851                assert(op.isTemp());
1852                if (!ctx.assignments[op.tempId()].assigned ||
1853                    ctx.assignments[op.tempId()].reg != exec) {
1854                    definition.setKill(false);
1855                    break;
1856                }
1857             }
1858          }
1859 
1860          if (definition.isKill())
1861             continue;
1862 
1863          assert(definition.physReg() == exec);
1864          assert(!register_file.test(definition.physReg(), definition.bytes()));
1865          register_file.fill(definition);
1866          ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1867       }
1868 
1869       /* look up the affinities */
1870       for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1871          aco_ptr<Instruction>& phi = *it;
1872          if (!is_phi(phi))
1873             break;
1874          Definition& definition = phi->definitions[0];
1875          if (definition.isKill() || definition.isFixed())
1876              continue;
1877 
1878          if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
1879              ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
1880             assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == definition.regClass());
1881             PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
1882             bool try_use_special_reg = reg == scc || reg == exec;
1883             if (try_use_special_reg) {
1884                for (const Operand& op : phi->operands) {
1885                   if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned &&
1886                         ctx.assignments[op.tempId()].reg == reg)) {
1887                      try_use_special_reg = false;
1888                      break;
1889                   }
1890                }
1891                if (!try_use_special_reg)
1892                   continue;
1893             }
1894             /* only assign if register is still free */
1895             if (!register_file.test(reg, definition.bytes())) {
1896                definition.setFixed(reg);
1897                register_file.fill(definition);
1898                ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1899             }
1900          }
1901       }
1902 
1903       /* find registers for phis without affinity or where the register was blocked */
1904       for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1905          aco_ptr<Instruction>& phi = *it;
1906          if (!is_phi(phi))
1907             break;
1908 
1909          Definition& definition = phi->definitions[0];
1910          if (definition.isKill())
1911             continue;
1912 
1913          if (!definition.isFixed()) {
1914             std::vector<std::pair<Operand, Definition>> parallelcopy;
1915             /* try to find a register that is used by at least one operand */
1916             for (const Operand& op : phi->operands) {
1917                if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned))
1918                   continue;
1919                PhysReg reg = ctx.assignments[op.tempId()].reg;
1920                /* we already tried this in the previous loop */
1921                if (reg == scc || reg == exec)
1922                   continue;
1923                if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1924                   definition.setFixed(reg);
1925                   break;
1926                }
1927             }
1928             if (!definition.isFixed())
1929                definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
1930 
1931             /* process parallelcopy */
1932             for (std::pair<Operand, Definition> pc : parallelcopy) {
1933                /* see if it's a copy from a different phi */
1934                //TODO: prefer moving some previous phis over live-ins
1935                //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1936                Instruction *prev_phi = NULL;
1937                std::vector<aco_ptr<Instruction>>::iterator phi_it;
1938                for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
1939                   if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1940                      prev_phi = phi_it->get();
1941                }
1942                phi_it = it;
1943                while (!prev_phi && is_phi(*++phi_it)) {
1944                   if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1945                      prev_phi = phi_it->get();
1946                }
1947                if (prev_phi) {
1948                   /* if so, just update that phi's register */
1949                   register_file.clear(prev_phi->definitions[0]);
1950                   prev_phi->definitions[0].setFixed(pc.second.physReg());
1951                   ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()};
1952                   register_file.fill(prev_phi->definitions[0]);
1953                   continue;
1954                }
1955 
1956                /* rename */
1957                std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1958                Temp orig = pc.first.getTemp();
1959                if (orig_it != ctx.orig_names.end())
1960                   orig = orig_it->second;
1961                else
1962                   ctx.orig_names[pc.second.tempId()] = orig;
1963                ctx.renames[block.index][orig.id()] = pc.second.getTemp();
1964 
1965                /* otherwise, this is a live-in and we need to create a new phi
1966                 * so the move happens in this block's predecessors */
1967                aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1968                std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1969                aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1970                new_phi->definitions[0] = pc.second;
1971                for (unsigned i = 0; i < preds.size(); i++)
1972                   new_phi->operands[i] = Operand(pc.first);
1973                instructions.emplace_back(std::move(new_phi));
1974             }
1975 
1976             register_file.fill(definition);
1977             ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1978          }
1979          live.insert(definition.tempId());
1980 
1981          /* update phi affinities */
1982          for (const Operand& op : phi->operands) {
1983             if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1984                ctx.affinities[op.tempId()] = definition.tempId();
1985          }
1986 
1987          instructions.emplace_back(std::move(*it));
1988       }
1989 
1990       /* fill in sgpr_live_in */
1991       for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
1992          sgpr_live_in[block.index][i] = register_file[i];
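           /* bit 127 records whether SCC is live after the phis; it is used
            * later to pick scratch SGPRs for phi-induced parallelcopies */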
1993       sgpr_live_in[block.index][127] = register_file[scc.reg()];
1994 
1995       /* Handle all other instructions of the block */
1996       for (; it != block.instructions.end(); ++it) {
1997          aco_ptr<Instruction>& instr = *it;
1998 
1999          /* parallelcopies from p_phi are inserted here, which means
2000           * live ranges of killed operands end here as well */
2001          if (instr->opcode == aco_opcode::p_logical_end) {
2002             /* no need to process this instruction any further */
2003             if (block.logical_succs.size() != 1) {
2004                instructions.emplace_back(std::move(instr));
2005                continue;
2006             }
2007 
2008             Block& succ = program->blocks[block.logical_succs[0]];
2009             unsigned idx = 0;
2010             for (; idx < succ.logical_preds.size(); idx++) {
2011                if (succ.logical_preds[idx] == block.index)
2012                   break;
2013             }
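                 /* SGPR phi operands killed by the successor's phis die at
                  * p_logical_end, so their registers are freed here for the
                  * parallelcopies the phis will introduce */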
2014             for (aco_ptr<Instruction>& phi : succ.instructions) {
2015                if (phi->opcode == aco_opcode::p_phi) {
2016                   if (phi->operands[idx].isTemp() &&
2017                       phi->operands[idx].getTemp().type() == RegType::sgpr &&
2018                       phi->operands[idx].isFirstKillBeforeDef()) {
2019                      Temp phi_op = read_variable(ctx, phi->operands[idx].getTemp(), block.index);
2020                      PhysReg reg = ctx.assignments[phi_op.id()].reg;
2021                      assert(register_file[reg] == phi_op.id());
2022                      register_file[reg] = 0;
2023                   }
2024                } else if (phi->opcode != aco_opcode::p_linear_phi) {
2025                   break;
2026                }
2027             }
2028             instructions.emplace_back(std::move(instr));
2029             continue;
2030          }
2031 
2032          std::vector<std::pair<Operand, Definition>> parallelcopy;
2033 
2034          assert(!is_phi(instr));
2035 
2036          /* handle operands */
2037          for (unsigned i = 0; i < instr->operands.size(); ++i) {
2038             auto& operand = instr->operands[i];
2039             if (!operand.isTemp())
2040                continue;
2041 
2042             /* rename operands */
2043             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
2044             assert(ctx.assignments[operand.tempId()].assigned);
2045 
2046             PhysReg reg = ctx.assignments[operand.tempId()].reg;
2047             if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass()))
2048                operand.setFixed(reg);
2049             else
2050                get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
2051 
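                 /* registers read by exports, GDS instructions and operand 3 of
                  * GFX6 VMEM instructions are flagged as WAR-hazard hints so
                  * register selection tries to avoid reusing them */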
2052             if (instr->format == Format::EXP ||
2053                 (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
2054                 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
2055                for (unsigned j = 0; j < operand.size(); j++)
2056                   ctx.war_hint.set(operand.physReg().reg() + j);
2057             }
2058 
2059             std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
2060             if (phi != ctx.phi_map.end())
2061                phi->second.uses.emplace(instr.get());
2062          }
2063 
2064          /* remove dead vars from register file */
2065          for (const Operand& op : instr->operands) {
2066             if (op.isTemp() && op.isFirstKillBeforeDef())
2067                register_file.clear(op);
2068          }
2069 
2070          /* try to optimize v_mad_f32 -> v_mac_f32 */
2071          if ((instr->opcode == aco_opcode::v_mad_f32 ||
2072               (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
2073               instr->opcode == aco_opcode::v_mad_f16 ||
2074               instr->opcode == aco_opcode::v_mad_legacy_f16 ||
2075               (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) &&
2076              instr->operands[2].isTemp() &&
2077              instr->operands[2].isKillBeforeDef() &&
2078              instr->operands[2].getTemp().type() == RegType::vgpr &&
2079              instr->operands[1].isTemp() &&
2080              instr->operands[1].getTemp().type() == RegType::vgpr &&
2081              !instr->usesModifiers() &&
2082              instr->operands[0].physReg().byte() == 0 &&
2083              instr->operands[1].physReg().byte() == 0 &&
2084              instr->operands[2].physReg().byte() == 0) {
2085             unsigned def_id = instr->definitions[0].tempId();
2086             auto it = ctx.affinities.find(def_id);
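                 /* only switch to the VOP2 mac/fmac form (which ties src2 to
                  * the destination) if it cannot break the definition's
                  * affinity: there is no affinity, src2 already sits in the
                  * affinity register, or that register is occupied anyway */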
2087             if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||
2088                 instr->operands[2].physReg() == ctx.assignments[it->second].reg ||
2089                 register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) {
2090                instr->format = Format::VOP2;
2091                switch (instr->opcode) {
2092                case aco_opcode::v_mad_f32:
2093                   instr->opcode = aco_opcode::v_mac_f32;
2094                   break;
2095                case aco_opcode::v_fma_f32:
2096                   instr->opcode = aco_opcode::v_fmac_f32;
2097                   break;
2098                case aco_opcode::v_mad_f16:
2099                case aco_opcode::v_mad_legacy_f16:
2100                   instr->opcode = aco_opcode::v_mac_f16;
2101                   break;
2102                case aco_opcode::v_fma_f16:
2103                   instr->opcode = aco_opcode::v_fmac_f16;
2104                   break;
2105                default:
2106                   break;
2107                }
2108             }
2109          }
2110 
2111          /* handle definitions which must have the same register as an operand */
2112          if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
2113              instr->opcode == aco_opcode::v_mac_f32 ||
2114              instr->opcode == aco_opcode::v_fmac_f32 ||
2115              instr->opcode == aco_opcode::v_mac_f16 ||
2116              instr->opcode == aco_opcode::v_fmac_f16 ||
2117              instr->opcode == aco_opcode::v_writelane_b32 ||
2118              instr->opcode == aco_opcode::v_writelane_b32_e64) {
2119             instr->definitions[0].setFixed(instr->operands[2].physReg());
2120          } else if (instr->opcode == aco_opcode::s_addk_i32 ||
2121                     instr->opcode == aco_opcode::s_mulk_i32) {
2122             instr->definitions[0].setFixed(instr->operands[0].physReg());
2123          } else if (instr->format == Format::MUBUF &&
2124                     instr->definitions.size() == 1 &&
2125                     instr->operands.size() == 4) {
2126             instr->definitions[0].setFixed(instr->operands[3].physReg());
2127          } else if (instr->format == Format::MIMG &&
2128                     instr->definitions.size() == 1 &&
2129                     instr->operands[1].regClass().type() == RegType::vgpr) {
2130             instr->definitions[0].setFixed(instr->operands[1].physReg());
2131          }
2132 
2133          ctx.defs_done.reset();
2134 
2135          /* handle fixed definitions first */
2136          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2137             auto& definition = instr->definitions[i];
2138             if (!definition.isFixed())
2139                continue;
2140 
2141             adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
2142             /* check if the target register is blocked */
2143             if (register_file.test(definition.physReg(), definition.bytes())) {
2144                /* create parallelcopy pair to move blocking vars */
2145                std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, register_file, definition.physReg(), definition.size());
2146 
2147                /* re-enable the killed operands, so that we don't move the blocking vars there */
2148                for (const Operand& op : instr->operands) {
2149                   if (op.isTemp() && op.isFirstKillBeforeDef())
2150                      register_file.fill(op);
2151                }
2152 
2153                ASSERTED bool success = false;
2154                DefInfo info(ctx, instr, definition.regClass(), -1);
2155                success = get_regs_for_copies(ctx, register_file, parallelcopy,
2156                                              vars, info.lb, info.ub, instr,
2157                                              definition.physReg(),
2158                                              definition.physReg() + definition.size() - 1);
2159                assert(success);
2160 
2161                update_renames(ctx, register_file, parallelcopy, instr, false);
2162 
2163                /* once again, disable killed operands */
2164                for (const Operand& op : instr->operands) {
2165                   if (op.isTemp() && op.isFirstKillBeforeDef())
2166                      register_file.clear(op);
2167                }
2168                for (unsigned k = 0; k < i; k++) {
2169                   if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
2170                      register_file.fill(instr->definitions[k]);
2171                }
2172             }
2173             ctx.defs_done.set(i);
2174 
2175             if (!definition.isTemp())
2176                continue;
2177 
2178             /* set live if it has a kill point */
2179             if (!definition.isKill())
2180                live.insert(definition.tempId());
2181 
2182             ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
2183             register_file.fill(definition);
2184          }
2185 
2186          /* handle all other definitions */
2187          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2188             Definition *definition = &instr->definitions[i];
2189 
2190             if (definition->isFixed() || !definition->isTemp())
2191                continue;
2192 
2193             /* find free reg */
2194             if (definition->hasHint() && register_file[definition->physReg().reg()] == 0)
2195                definition->setFixed(definition->physReg());
2196             else if (instr->opcode == aco_opcode::p_split_vector) {
2197                PhysReg reg = instr->operands[0].physReg();
2198                for (unsigned j = 0; j < i; j++)
2199                   reg.reg_b += instr->definitions[j].bytes();
2200                if (get_reg_specified(ctx, register_file, definition->regClass(), parallelcopy, instr, reg))
2201                   definition->setFixed(reg);
2202             } else if (instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_parallelcopy) {
2203                PhysReg reg = instr->operands[i].physReg();
2204                if (instr->operands[i].isTemp() &&
2205                    instr->operands[i].getTemp().type() == definition->getTemp().type() &&
2206                    !register_file.test(reg, definition->bytes()))
2207                   definition->setFixed(reg);
2208             } else if (instr->opcode == aco_opcode::p_extract_vector) {
2209                PhysReg reg = instr->operands[0].physReg();
2210                reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
2211                if (get_reg_specified(ctx, register_file, definition->regClass(), parallelcopy, instr, reg))
2212                   definition->setFixed(reg);
2213             } else if (instr->opcode == aco_opcode::p_create_vector) {
2214                PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
2215                                                    parallelcopy, instr);
2216                definition->setFixed(reg);
2217             }
2218 
2219             if (!definition->isFixed()) {
2220                Temp tmp = definition->getTemp();
2221                if (definition->regClass().is_subdword() && definition->bytes() < 4) {
2222                   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
2223                   definition->setFixed(reg);
2224                   if (reg.byte() || register_file.test(reg, 4)) {
2225                      add_subdword_definition(program, instr, i, reg);
2226                      definition = &instr->definitions[i]; /* add_subdword_definition can invalidate the reference */
2227                   }
2228                } else {
2229                   definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
2230                }
2231             }
2232 
2233             assert(definition->isFixed() && ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
2234                                              (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
2235             ctx.defs_done.set(i);
2236 
2237             /* set live if it has a kill point */
2238             if (!definition->isKill())
2239                live.insert(definition->tempId());
2240 
2241             ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};
2242             register_file.fill(*definition);
2243          }
2244 
2245          handle_pseudo(ctx, register_file, instr.get());
2246 
2247          /* kill definitions and late-kill operands and ensure that sub-dword operands can actually be read */
2248          for (const Definition& def : instr->definitions) {
2249              if (def.isTemp() && def.isKill())
2250                 register_file.clear(def);
2251          }
2252          for (unsigned i = 0; i < instr->operands.size(); i++) {
2253             const Operand& op = instr->operands[i];
2254             if (op.isTemp() && op.isFirstKill() && op.isLateKill())
2255                register_file.clear(op);
2256             if (op.isTemp() && op.physReg().byte() != 0)
2257                add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
2258          }
2259 
2260          /* emit parallelcopy */
2261          if (!parallelcopy.empty()) {
2262             aco_ptr<Pseudo_instruction> pc;
2263             pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
2264             bool temp_in_scc = register_file[scc.reg()];
2265             bool sgpr_operands_alias_defs = false;
2266             uint64_t sgpr_operands[4] = {0, 0, 0, 0};
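                 /* record which SGPRs the copies read; if a copy definition
                  * overlaps one of them while SCC is live, the parallelcopy
                  * lowering needs a scratch SGPR (chosen via handle_pseudo) */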
2267             for (unsigned i = 0; i < parallelcopy.size(); i++) {
2268                if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
2269                   if (!sgpr_operands_alias_defs) {
2270                      unsigned reg = parallelcopy[i].first.physReg().reg();
2271                      unsigned size = parallelcopy[i].first.getTemp().size();
2272                      sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
2273 
2274                      reg = parallelcopy[i].second.physReg().reg();
2275                      size = parallelcopy[i].second.getTemp().size();
2276                      if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
2277                         sgpr_operands_alias_defs = true;
2278                   }
2279                }
2280 
2281                pc->operands[i] = parallelcopy[i].first;
2282                pc->definitions[i] = parallelcopy[i].second;
2283                assert(pc->operands[i].size() == pc->definitions[i].size());
2284 
2285                /* it might happen that the operand is already renamed. we have to restore the original name. */
2286                std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
2287                Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
2288                ctx.orig_names[pc->definitions[i].tempId()] = orig;
2289                ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
2290 
2291                std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(pc->operands[i].tempId());
2292                if (phi != ctx.phi_map.end())
2293                   phi->second.uses.emplace(pc.get());
2294             }
2295 
2296             if (temp_in_scc && sgpr_operands_alias_defs) {
2297                /* disable definitions and re-enable operands */
2298                for (const Definition& def : instr->definitions) {
2299                   if (def.isTemp() && !def.isKill())
2300                      register_file.clear(def);
2301                }
2302                for (const Operand& op : instr->operands) {
2303                   if (op.isTemp() && op.isFirstKill())
2304                      register_file.block(op.physReg(), op.regClass());
2305                }
2306 
2307                handle_pseudo(ctx, register_file, pc.get());
2308 
2309                /* re-enable live vars */
2310                for (const Operand& op : instr->operands) {
2311                   if (op.isTemp() && op.isFirstKill())
2312                      register_file.clear(op);
2313                }
2314                for (const Definition& def : instr->definitions) {
2315                   if (def.isTemp() && !def.isKill())
2316                      register_file.fill(def);
2317                }
2318             } else {
2319                pc->tmp_in_scc = false;
2320             }
2321 
2322             instructions.emplace_back(std::move(pc));
2323          }
2324 
2325          /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
2326          bool instr_needs_vop3 = !instr->isVOP3() &&
2327                                  ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
2328                                   (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
2329                                   ((instr->opcode == aco_opcode::v_add_co_u32 ||
2330                                     instr->opcode == aco_opcode::v_addc_co_u32 ||
2331                                     instr->opcode == aco_opcode::v_sub_co_u32 ||
2332                                     instr->opcode == aco_opcode::v_subb_co_u32 ||
2333                                     instr->opcode == aco_opcode::v_subrev_co_u32 ||
2334                                     instr->opcode == aco_opcode::v_subbrev_co_u32) &&
2335                                    !(instr->definitions[1].physReg() == vcc)) ||
2336                                   ((instr->opcode == aco_opcode::v_addc_co_u32 ||
2337                                     instr->opcode == aco_opcode::v_subb_co_u32 ||
2338                                     instr->opcode == aco_opcode::v_subbrev_co_u32) &&
2339                                    !(instr->operands[2].physReg() == vcc)));
2340          if (instr_needs_vop3) {
2341 
2342             /* if the first operand is a literal, we have to move it to a reg */
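                /* VOP3 encodings cannot take a literal before GFX10, so the
                 * literal is first copied into a register */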
2343             if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) {
2344                bool can_sgpr = true;
2345                /* check if we have to move to a vgpr */
2346                for (const Operand& op : instr->operands) {
2347                   if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
2348                      can_sgpr = false;
2349                      break;
2350                   }
2351                }
2352                /* disable definitions and re-enable operands */
2353                for (const Definition& def : instr->definitions)
2354                   register_file.clear(def);
2355                for (const Operand& op : instr->operands) {
2356                   if (op.isTemp() && op.isFirstKill())
2357                      register_file.block(op.physReg(), op.regClass());
2358                }
2359                Temp tmp = program->allocateTmp(can_sgpr ? s1 : v1);
2360                ctx.assignments.emplace_back();
2361                PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
2362 
2363                aco_ptr<Instruction> mov;
2364                if (can_sgpr)
2365                   mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
2366                else
2367                   mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
2368                mov->operands[0] = instr->operands[0];
2369                mov->definitions[0] = Definition(tmp);
2370                mov->definitions[0].setFixed(reg);
2371 
2372                instr->operands[0] = Operand(tmp);
2373                instr->operands[0].setFixed(reg);
2374                instructions.emplace_back(std::move(mov));
2375                /* re-enable live vars */
2376                for (const Operand& op : instr->operands) {
2377                   if (op.isTemp() && op.isFirstKill())
2378                      register_file.clear(op);
2379                }
2380                for (const Definition& def : instr->definitions) {
2381                   if (def.isTemp() && !def.isKill())
2382                      register_file.fill(def);
2383                }
2384             }
2385 
2386             /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
2387             aco_ptr<Instruction> tmp = std::move(instr);
2388             Format format = asVOP3(tmp->format);
2389             instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
2390             std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin());
2391             std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
2392             update_phi_map(ctx, tmp.get(), instr.get());
2393          }
2394 
2395          instructions.emplace_back(std::move(*it));
2396 
2397       } /* end for Instr */
2398 
2399       block.instructions = std::move(instructions);
2400 
2401       ctx.filled[block.index] = true;
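           /* a successor is sealed once all of its linear predecessors have
            * been filled; only then can its incomplete phis be given their
            * final operands */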
2402       for (unsigned succ_idx : block.linear_succs) {
2403          Block& succ = program->blocks[succ_idx];
2404          /* seal block if all predecessors are filled */
2405          bool all_filled = true;
2406          for (unsigned pred_idx : succ.linear_preds) {
2407             if (!ctx.filled[pred_idx]) {
2408                all_filled = false;
2409                break;
2410             }
2411          }
2412          if (all_filled) {
2413             ctx.sealed[succ_idx] = true;
2414 
2415             /* finish incomplete phis and check if they became trivial */
2416             for (Instruction* phi : ctx.incomplete_phis[succ_idx]) {
2417                std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
2418                for (unsigned i = 0; i < phi->operands.size(); i++) {
2419                   phi->operands[i].setTemp(read_variable(ctx, phi->operands[i].getTemp(), preds[i]));
2420                   phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].reg);
2421                }
2422                try_remove_trivial_phi(ctx, phi->definitions[0].getTemp());
2423             }
2424             /* complete the original phi nodes, but no need to check triviality */
2425             for (aco_ptr<Instruction>& instr : succ.instructions) {
2426                if (!is_phi(instr))
2427                   break;
2428                std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
2429 
2430                for (unsigned i = 0; i < instr->operands.size(); i++) {
2431                   auto& operand = instr->operands[i];
2432                   if (!operand.isTemp())
2433                      continue;
2434                   operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2435                   operand.setFixed(ctx.assignments[operand.tempId()].reg);
2436                   std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
2437                   if (phi != ctx.phi_map.end())
2438                      phi->second.uses.emplace(instr.get());
2439                }
2440             }
2441          }
2442       }
2443    } /* end for BB */
2444 
2445    /* remove trivial phis */
2446    for (Block& block : program->blocks) {
2447       auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
2448                               [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
2449       auto middle = std::remove_if(block.instructions.begin(), end,
2450                                    [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
2451       block.instructions.erase(middle, end);
2452    }
2453 
2454    /* find scc spill registers which may be needed for parallelcopies created by phis */
2455    for (Block& block : program->blocks) {
2456       if (block.linear_preds.size() <= 1)
2457          continue;
2458 
2459       std::bitset<128> regs = sgpr_live_in[block.index];
2460       if (!regs[127])
2461          continue;
2462 
2463       /* choose a register */
2464       int16_t reg = 0;
2465       for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
2466          ;
2467       assert(reg < ctx.program->max_reg_demand.sgpr);
2468       adjust_max_used_regs(ctx, s1, reg);
2469 
2470       /* update predecessors */
2471       for (unsigned& pred_index : block.linear_preds) {
2472          Block& pred = program->blocks[pred_index];
2473          pred.scc_live_out = true;
2474          pred.scratch_sgpr = PhysReg{(uint16_t)reg};
2475       }
2476    }
2477 
2478    /* num_gpr = rnd_up(max_used_gpr + 1) */
2479    program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
2480    if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
2481       program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
2482    else
2483       program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
2484 }
2485 
2486 }
2487