/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "util/bitset.h"
#include "util/enum_operators.h"

#include <algorithm>
#include <array>
#include <bitset>
#include <map>
#include <optional>
#include <vector>

namespace aco {
namespace {

struct ra_ctx;
struct DefInfo;

unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                                     unsigned idx, RegClass rc);
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                          RegClass rc);
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
                             bool allow_16bit_write);

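/* Per-temporary allocation state: the physical register and register class
 * currently assigned to an SSA temporary, plus flags and an affinity hint
 * used during allocation. */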
struct assignment {
   PhysReg reg;
   RegClass rc;
   union {
      struct {
         bool assigned : 1;
         bool vcc : 1;
         bool m0 : 1;
         bool renamed : 1;
      };
      uint8_t _ = 0;
   };
   uint32_t affinity = 0;
   assignment() = default;
   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }
   void set(const Definition& def)
   {
      assigned = true;
      reg = def.physReg();
      rc = def.regClass();
   }
};

/* Iterator type for making PhysRegInterval compatible with range-based for */
struct PhysRegIterator {
   using difference_type = int;
   using value_type = unsigned;
   using reference = const unsigned&;
   using pointer = const unsigned*;
   using iterator_category = std::bidirectional_iterator_tag;

   PhysReg reg;

   PhysReg operator*() const { return reg; }

   PhysRegIterator& operator++()
   {
      reg.reg_b += 4;
      return *this;
   }

   PhysRegIterator& operator--()
   {
      reg.reg_b -= 4;
      return *this;
   }

   bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }

   bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }

   bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
};

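/* Describes a run of operands of some instruction that would benefit from
 * being allocated contiguously, e.g. MIMG vaddr components (see
 * get_reg_vector()). */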
struct vector_info {
   vector_info() : is_weak(false), num_parts(0), parts(NULL) {}
   vector_info(Instruction* instr, unsigned start = 0, bool weak = false)
       : is_weak(weak), num_parts(instr->operands.size() - start),
         parts(instr->operands.begin() + start)
   {}

   /* If true, then we should stop trying to form a vector if anything goes wrong. Useful when
    * failing does not introduce copies. */
   bool is_weak;
   uint32_t num_parts;
   Operand* parts;
};

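/* State tracked across the whole register allocation pass: per-temporary
 * assignments, per-block rename maps, vector hints and the currently used
 * register bounds. */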
struct ra_ctx {

   Program* program;
   Block* block = NULL;
   aco::monotonic_buffer_resource memory;
   std::vector<assignment> assignments;
   std::vector<aco::unordered_map<uint32_t, Temp>> renames;
   std::vector<std::pair<uint32_t, PhysReg>> loop_header;
   aco::unordered_map<uint32_t, Temp> orig_names;
   aco::unordered_map<uint32_t, vector_info> vectors;
   aco::unordered_map<uint32_t, Instruction*> split_vectors;
   aco_ptr<Instruction> pseudo_dummy;
   aco_ptr<Instruction> phi_dummy;
   uint16_t max_used_sgpr = 0;
   uint16_t max_used_vgpr = 0;
   uint16_t sgpr_limit;
   uint16_t vgpr_limit;
   std::bitset<512> war_hint;
   PhysRegIterator rr_sgpr_it;
   PhysRegIterator rr_vgpr_it;

   uint16_t sgpr_bounds;
   uint16_t vgpr_bounds;
   uint16_t num_linear_vgprs;

   ra_test_policy policy;

   ra_ctx(Program* program_, ra_test_policy policy_)
       : program(program_), assignments(program->peekAllocationId()),
         renames(program->blocks.size(), aco::unordered_map<uint32_t, Temp>(memory)),
         orig_names(memory), vectors(memory), split_vectors(memory), policy(policy_)
   {
      pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
      phi_dummy.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
      sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
      vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

      sgpr_bounds = program->max_reg_demand.sgpr;
      vgpr_bounds = program->max_reg_demand.vgpr;
      num_linear_vgprs = 0;
   }
};

/* Half-open register interval used in "sliding window"-style for-loops */
struct PhysRegInterval {
   PhysReg lo_;
   unsigned size;

   /* Inclusive lower bound */
   PhysReg lo() const { return lo_; }

   /* Exclusive upper bound */
   PhysReg hi() const { return PhysReg{lo() + size}; }

   PhysRegInterval& operator+=(uint32_t stride)
   {
      lo_ = PhysReg{lo_.reg() + stride};
      return *this;
   }

   bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }

   /* Construct a half-open interval, excluding the end register */
   static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }

   bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }

   bool contains(const PhysRegInterval& needle) const
   {
      return needle.lo() >= lo() && needle.hi() <= hi();
   }

   PhysRegIterator begin() const { return {lo_}; }

   PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
};

bool
intersects(const PhysRegInterval& a, const PhysRegInterval& b)
{
   return a.hi() > b.lo() && b.hi() > a.lo();
}

/* Gets the stride for full (non-subdword) registers */
uint32_t
get_stride(RegClass rc)
{
   if (rc.type() == RegType::vgpr) {
      return 1;
   } else {
      uint32_t size = rc.size();
      if (size == 2) {
         return 2;
      } else if (size >= 4) {
         return 4;
      } else {
         return 1;
      }
   }
}

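/* Returns the window of registers that allocations of the given type may use:
 * the SGPR window, the normal VGPR window, or the linear VGPR window at the
 * top of the VGPR file. */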
PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr)
{
   uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs;
   if (type == RegType::vgpr && linear_vgpr) {
      return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs};
   } else if (type == RegType::vgpr) {
      return PhysRegInterval{PhysReg(256), linear_vgpr_start};
   } else {
      return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds};
   }
}

PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegClass rc)
{
   return get_reg_bounds(ctx, rc.type(), rc.is_linear_vgpr());
}

struct DefInfo {
   PhysRegInterval bounds;
   uint8_t size;
   uint8_t stride;
   /* Even if stride=4, we might be able to write to the high half instead without preserving the
    * low half. In that case, data_stride=2. */
   uint8_t data_stride;
   RegClass rc;

   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);
      data_stride = 0;

      bounds = get_reg_bounds(ctx, rc);

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
      } else if (rc.is_subdword()) {
         get_subdword_definition_info(ctx.program, instr);
      } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
         /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
          *
          * The register use is not calculated correctly, and the hardware assumes a
          * full dword per component. Don't use the last registers of the register file.
          * Otherwise, the instruction will be skipped.
          *
          * https://reviews.llvm.org/D81172
          */
         bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
         assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");

         if (imageGather4D16Bug)
            bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0);
      }

      if (!data_stride)
         data_stride = rc.is_subdword() ? stride : (stride * 4);
   }

private:
   void get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr);
};

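/* Tracks the contents of each physical register: 0 means free, 0xFFFFFFFF
 * blocked, and any other value is the id of the occupying temporary. The
 * special value 0xF0000000 marks a register whose individual bytes are
 * tracked in subdword_regs instead. */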
class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

   std::array<uint32_t, 512> regs;
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   unsigned count_zero(PhysRegInterval reg_interval) const
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes) const
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            auto it = subdword_regs.find(i);
            assert(it != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (it->second[j])
                  return true;
            }
         }
      }
      return false;
   }

   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   bool is_blocked(PhysReg start) const
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         auto it = subdword_regs.find(start);
         assert(it != subdword_regs.end());
         for (unsigned i = start.byte(); i < 4; i++)
            if (it->second[i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start) const
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         auto it = subdword_regs.find(start);
         assert(it != subdword_regs.end());
         return it->second[start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   void fill_killed_operands(Instruction* instr)
   {
      for (Operand& op : instr->operands) {
         if (op.isPrecolored()) {
            block(op.physReg(), op.regClass());
         } else if (op.isFixed() && op.isFirstKillBeforeDef()) {
            if (op.regClass().is_subdword())
               fill_subdword(op.physReg(), op.bytes(), op.tempId());
            else
               fill(op.physReg(), op.size(), op.tempId());
         }
      }
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   unsigned get_id(PhysReg reg) const
   {
      return regs[reg] == 0xF0000000 ? subdword_regs.at(reg)[reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};

std::vector<unsigned> find_vars(ra_ctx& ctx, const RegisterFile& reg_file,
                                const PhysRegInterval reg_interval);

/* helper function for debugging */
UNUSED void
print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
{
   if (reg_file[reg] == 0xFFFFFFFF) {
      printf((const char*)u8"☐");
   } else if (reg_file[reg]) {
      const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
      if (show_subdword_alloc) {
         auto block_chars = {
            // clang-format off
            u8"?", u8"▘", u8"▝", u8"▀",
            u8"▖", u8"▌", u8"▞", u8"▛",
            u8"▗", u8"▚", u8"▐", u8"▜",
            u8"▄", u8"▙", u8"▟", u8"▉"
            // clang-format on
         };
         unsigned index = 0;
         for (int i = 0; i < 4; ++i) {
            if (reg_file.subdword_regs.at(reg)[i]) {
               index |= 1 << i;
            }
         }
         printf("%s", (const char*)(block_chars.begin()[index]));
      } else {
         /* Indicate filled register slot */
         if (!has_adjacent_variable) {
            printf((const char*)u8"█");
         } else {
            /* Use a slightly shorter box to leave a small gap between adjacent variables */
            printf((const char*)u8"▉");
         }
      }
   } else {
      printf((const char*)u8"·");
   }
}

/* helper function for debugging */
UNUSED void
print_regs(ra_ctx& ctx, PhysRegInterval regs, const RegisterFile& reg_file)
{
   char reg_char = regs.lo().reg() >= 256 ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf(" ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         printf(" %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */
   for (unsigned id : find_vars(ctx, reg_file, regs)) {
      const assignment& var = ctx.assignments[id];
      PhysReg reg = var.reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() % 256);
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() % 256);
      }
      printf("]");
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}

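/* Returns whether writes to this SGPR are free of side effects, i.e. it does
 * not alias special registers like flat_scratch or xnack_mask on the given
 * GFX level. */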
bool
is_sgpr_writable_without_side_effects(amd_gfx_level gfx_level, PhysReg reg)
{
   assert(reg < 256);
   bool has_flat_scr_lo_gfx89 = gfx_level >= GFX8 && gfx_level <= GFX9;
   bool has_flat_scr_lo_gfx7_or_xnack_mask = gfx_level <= GFX9;
   return (reg <= vcc_hi || reg == m0) &&
          (!has_flat_scr_lo_gfx89 || (reg != flat_scr_lo && reg != flat_scr_hi)) &&
          (!has_flat_scr_lo_gfx7_or_xnack_mask || (reg != 104 && reg != 105));
}

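/* Returns the byte stride at which a subdword operand of this instruction can
 * be placed within a register, depending on the available SDWA/opsel/D16
 * variants. */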
unsigned
get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                            unsigned idx, RegClass rc)
{
   assert(gfx_level >= GFX8);
   if (instr->isPseudo()) {
      /* v_readfirstlane_b32 cannot use SDWA */
      if (instr->opcode == aco_opcode::p_as_uniform)
         return 4;
      else
         return rc.bytes() % 2 == 0 ? 2 : 1;
   }

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (can_use_SDWA(gfx_level, instr, false))
         return rc.bytes();
      if (can_use_opsel(gfx_level, instr->opcode, idx))
         return 2;
      if (instr->isVOP3P())
         return 2;
   }

   switch (instr->opcode) {
   case aco_opcode::v_cvt_f32_ubyte0: return 1;
   case aco_opcode::ds_write_b8:
   case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
   case aco_opcode::buffer_store_byte:
   case aco_opcode::buffer_store_short:
   case aco_opcode::buffer_store_format_d16_x:
   case aco_opcode::flat_store_byte:
   case aco_opcode::flat_store_short:
   case aco_opcode::scratch_store_byte:
   case aco_opcode::scratch_store_short:
   case aco_opcode::global_store_byte:
   case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
   default: return 4;
   }
}

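/* Rewrites the instruction so that the operand is read from the given byte
 * offset, using SDWA, opsel or a *_d16_hi opcode. */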
void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                     RegClass rc)
{
   amd_gfx_level gfx_level = ctx.program->gfx_level;
   if (instr->isPseudo() || byte == 0)
      return;

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
         switch (byte) {
         case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
         case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
         case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
         case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
         }
         return;
      }

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      /* use opsel */
      if (instr->isVOP3P()) {
         assert(byte == 2 && !instr->valu().opsel_lo[idx]);
         instr->valu().opsel_lo[idx] = true;
         instr->valu().opsel_hi[idx] = true;
         return;
      }

      assert(can_use_opsel(gfx_level, instr->opcode, idx));
      instr->valu().opsel[idx] = true;
      return;
   }

   assert(byte == 2);
   if (instr->opcode == aco_opcode::ds_write_b8)
      instr->opcode = aco_opcode::ds_write_b8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_write_b16)
      instr->opcode = aco_opcode::ds_write_b16_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_byte)
      instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_short)
      instr->opcode = aco_opcode::buffer_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
      instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_store_byte)
      instr->opcode = aco_opcode::flat_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_store_short)
      instr->opcode = aco_opcode::flat_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_byte)
      instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_short)
      instr->opcode = aco_opcode::scratch_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_byte)
      instr->opcode = aco_opcode::global_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_short)
      instr->opcode = aco_opcode::global_store_short_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
   return;
}

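/* Determines the stride (and possibly a widened register class) for a
 * subdword definition, based on which 16-bit write forms the instruction
 * supports. */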
void
DefInfo::get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr)
{
   amd_gfx_level gfx_level = program->gfx_level;
   assert(gfx_level >= GFX8);

   stride = rc.bytes() % 2 == 0 ? 2 : 1;

   if (instr->isPseudo()) {
      if (instr->opcode == aco_opcode::p_interp_gfx11) {
         rc = RegClass(RegType::vgpr, rc.size());
         stride = 1;
      }
      return;
   }

   if (instr->isVALU()) {
      assert(rc.bytes() <= 2);

      if (can_use_SDWA(gfx_level, instr, false) || instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return;

      rc = instr_is_16bit(gfx_level, instr->opcode) ? v2b : v1;
      stride = rc == v2b ? 4 : 1;
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
          can_use_opsel(gfx_level, instr->opcode, -1)) {
         data_stride = 2;
         stride = rc == v2b ? 2 : stride;
      }
      return;
   }

   switch (instr->opcode) {
   case aco_opcode::v_interp_p2_f16: return;
   /* D16 loads with _hi version */
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_i8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_sbyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_sbyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_sbyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16:
   case aco_opcode::buffer_load_format_d16_x: {
      assert(gfx_level >= GFX9);
      if (program->dev.sram_ecc_enabled) {
         rc = v1;
         stride = 1;
         data_stride = 2;
      } else {
         stride = 2;
      }
      return;
   }
   /* 3-component D16 loads */
   case aco_opcode::buffer_load_format_d16_xyz:
   case aco_opcode::tbuffer_load_format_d16_xyz: {
      assert(gfx_level >= GFX9);
      if (program->dev.sram_ecc_enabled) {
         rc = v2;
         stride = 1;
      } else {
         stride = 4;
      }
      return;
   }
   default: break;
   }

   if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
      assert(gfx_level >= GFX9);
      stride = 4;
   } else {
      rc = RegClass(RegType::vgpr, rc.size());
      stride = 1;
   }
}

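/* Rewrites the instruction so that its definition is written at the given
 * register's byte offset, using SDWA, opsel or a *_d16_hi opcode. */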
void
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
                        bool allow_16bit_write)
{
   if (instr->isPseudo())
      return;

   if (instr->isVALU()) {
      amd_gfx_level gfx_level = program->gfx_level;
      assert(instr->definitions[0].bytes() <= 2);

      if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return;

      if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode))
         return;

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      assert(allow_16bit_write);

      if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
         instr->opcode = aco_opcode::v_fma_mixhi_f16;
         return;
      }

      /* use opsel */
      assert(reg.byte() == 2);
      assert(can_use_opsel(gfx_level, instr->opcode, -1));
      instr->valu().opsel[3] = true; /* dst in high half */
      return;
   }

   if (reg.byte() == 0)
      return;
   else if (instr->opcode == aco_opcode::v_interp_p2_f16)
      instr->opcode = aco_opcode::v_interp_p2_hi_f16;
   else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
      instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
      instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_short_d16)
      instr->opcode = aco_opcode::buffer_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
      instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
      instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
      instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_short_d16)
      instr->opcode = aco_opcode::flat_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
      instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
      instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_short_d16)
      instr->opcode = aco_opcode::scratch_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
      instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
      instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_short_d16)
      instr->opcode = aco_opcode::global_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u8_d16)
      instr->opcode = aco_opcode::ds_read_u8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_i8_d16)
      instr->opcode = aco_opcode::ds_read_i8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u16_d16)
      instr->opcode = aco_opcode::ds_read_u16_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
}

void
adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
{
   uint16_t max_addressible_sgpr = ctx.sgpr_limit;
   unsigned size = rc.size();
   if (rc.type() == RegType::vgpr) {
      assert(reg >= 256);
      uint16_t hi = reg - 256 + size - 1;
      assert(hi <= 255);
      ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
   } else if (reg + rc.size() <= max_addressible_sgpr) {
      uint16_t hi = reg + size - 1;
      ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
   }
}

enum UpdateRenames {
   rename_not_killed_ops = 0x1,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);

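/* Updates the register file, the instruction and the parallelcopies after new
 * copies have been created: allocates ids for the copied temporaries, renames
 * operands to them and fixes up kill flags. */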
void
update_renames(ra_ctx& ctx, RegisterFile& reg_file,
               std::vector<std::pair<Operand, Definition>>& parallelcopies,
               aco_ptr<Instruction>& instr, UpdateRenames flags)
{
   /* clear operands */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with an id are not from this function and are already handled */
      if (copy.second.isTemp())
         continue;
      reg_file.clear(copy.first);
   }

   /* allocate id's and rename operands: this is done transparently here */
   auto it = parallelcopies.begin();
   while (it != parallelcopies.end()) {
      if (it->second.isTemp()) {
         ++it;
         continue;
      }

      /* check if we moved a definition: change the register and remove copy */
      bool is_def = false;
      for (Definition& def : instr->definitions) {
         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
            // FIXME: ensure that the definition can use this reg
            def.setFixed(it->second.physReg());
            reg_file.fill(def);
            ctx.assignments[def.tempId()].reg = def.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            break;
         }
      }
      if (is_def)
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (it->first.getTemp() == other.second.getTemp()) {
            other.second.setFixed(it->second.physReg());
            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            /* check if we moved an operand, again */
            bool fill = true;
            for (Operand& op : instr->operands) {
               if (op.isTemp() && op.tempId() == other.second.tempId()) {
                  // FIXME: ensure that the operand can use this reg
                  op.setFixed(other.second.physReg());
                  fill = !op.isKillBeforeDef();
               }
            }
            if (fill)
               reg_file.fill(other.second);
            break;
         }
      }
      if (is_def)
         continue;

      std::pair<Operand, Definition>& copy = *it;
      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());

      /* check if we moved an operand */
      bool first[2] = {true, true};
      bool fill = true;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* only rename precolored operands if the copy-location matches */
            bool omit_renaming = op.isPrecolored() && op.physReg() != copy.second.physReg();

            /* Omit renaming in some cases for p_create_vector in order to avoid
             * unnecessary shuffle code. */
            if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) {
               omit_renaming = true;
               for (std::pair<Operand, Definition>& pc : parallelcopies) {
                  PhysReg def_reg = pc.second.physReg();
                  omit_renaming &= def_reg > copy.first.physReg()
                                      ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
                                      : (def_reg + pc.second.size() <= copy.first.physReg().reg());
               }
            }

            /* Fix the kill flags */
            if (first[omit_renaming])
               op.setFirstKill(omit_renaming || op.isKill());
            else
               op.setKill(omit_renaming || op.isKill());
            first[omit_renaming] = false;

            if (omit_renaming)
               continue;

            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());

            fill = !op.isKillBeforeDef() || op.isPrecolored();
         }
      }

      /* Apply changes to register file. */
      if (fill)
         reg_file.fill(copy.second);

      ++it;
   }
}

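/* Tries to find a free window for the definition without moving any other
 * variables, preferring round-robin allocation for stride-1 register classes.
 * Returns an empty optional on failure. */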
std::optional<PhysReg>
get_reg_simple(ra_ctx& ctx, const RegisterFile& reg_file, DefInfo info)
{
   PhysRegInterval bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
   RegClass rc = info.rc;

   if (stride < size && !rc.is_subdword()) {
      DefInfo new_info = info;
      new_info.stride = stride * 2;
      if (size % new_info.stride == 0) {
         std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info);
         if (res)
            return res;
      }
   }

   PhysRegIterator& rr_it = rc.type() == RegType::vgpr ? ctx.rr_vgpr_it : ctx.rr_sgpr_it;
   if (stride == 1) {
      if (rr_it != bounds.begin() && bounds.contains(rr_it.reg)) {
         assert(bounds.begin() < rr_it);
         assert(rr_it < bounds.end());
         info.bounds = PhysRegInterval::from_until(rr_it.reg, bounds.hi());
         std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
         if (res)
            return res;
         bounds = PhysRegInterval::from_until(bounds.lo(), rr_it.reg);
      }
   }

   auto is_free = [&](PhysReg reg_index)
   { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };

   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      if (std::all_of(reg_win.begin(), reg_win.end(), is_free)) {
         if (stride == 1) {
            PhysRegIterator new_rr_it{PhysReg{reg_win.lo() + size}};
            if (new_rr_it < bounds.end())
               rr_it = new_rr_it;
         }
         adjust_max_used_regs(ctx, rc, reg_win.lo());
         return reg_win.lo();
      }
   }

   /* do this late because using the upper bytes of a register can require
    * larger instruction encodings or copies
    * TODO: don't do this in situations where it doesn't benefit */
   if (rc.is_subdword()) {
      for (const std::pair<const uint32_t, std::array<uint32_t, 4>>& entry :
           reg_file.subdword_regs) {
         assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
         if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
            continue;

         auto it = entry.second.begin();
         for (unsigned i = 0; i < 4; i += info.stride) {
            /* check if there's a block of free bytes large enough to hold the register */
            bool reg_found =
               std::all_of(std::next(it, i), std::next(it, std::min(4u, i + rc.bytes())),
                           [](unsigned v) { return v == 0; });

            /* check if also the neighboring reg is free if needed */
            if (reg_found && i + rc.bytes() > 4)
               reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);

            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               adjust_max_used_regs(ctx, rc, entry.first);
               return res;
            }
         }
      }
   }

   return {};
}

/* collect variables from a register area */
std::vector<unsigned>
find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> vars;
   for (PhysReg j : reg_interval) {
      if (reg_file.is_blocked(j))
         continue;
      if (reg_file[j] == 0xF0000000) {
         for (unsigned k = 0; k < 4; k++) {
            unsigned id = reg_file.subdword_regs.at(j)[k];
            if (id && (vars.empty() || id != vars.back()))
               vars.emplace_back(id);
         }
      } else {
         unsigned id = reg_file[j];
         if (id && (vars.empty() || id != vars.back()))
            vars.emplace_back(id);
      }
   }
   return vars;
}

/* collect variables from a register area and clear reg_file
 * variables are sorted in decreasing size and
 * increasing assigned register
 */
std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
   std::sort(ids.begin(), ids.end(),
             [&](unsigned a, unsigned b)
             {
                assignment& var_a = ctx.assignments[a];
                assignment& var_b = ctx.assignments[b];
                return var_a.rc.bytes() > var_b.rc.bytes() ||
                       (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
             });

   for (unsigned id : ids) {
      assignment& var = ctx.assignments[id];
      reg_file.clear(var.reg, var.rc);
   }
   return ids;
}

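/* For p_create_vector, tries to place a displaced variable that is also an
 * operand at its final position inside the new vector, or to swap it with a
 * killed operand that already occupies that position (GFX9+ has a VGPR swap
 * instruction). */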
std::optional<PhysReg>
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
                               aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
                               DefInfo info, unsigned id)
{
   PhysReg reg = def_reg.lo();
   /* dead operand: return position in vector */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
          instr->operands[i].isKillBeforeDef()) {
         assert(!reg_file.test(reg, instr->operands[i].bytes()));
         if (info.rc.is_subdword() || reg.byte() == 0)
            return reg;
         else
            return {};
      }
      reg.reg_b += instr->operands[i].bytes();
   }

   /* GFX9+ has a VGPR swap instruction. */
   if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr)
      return {};

   /* check if the previous position was in vector */
   assignment& var = ctx.assignments[id];
   if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
      reg = def_reg.lo();
      /* try to use the previous register of the operand */
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         if (reg != var.reg) {
            reg.reg_b += instr->operands[i].bytes();
            continue;
         }

         /* check if we can swap positions */
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
             instr->operands[i].regClass() == info.rc) {
            assignment& op = ctx.assignments[instr->operands[i].tempId()];
            /* if everything matches, create parallelcopy for the killed operand */
            if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc &&
                reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
               Definition pc_def = Definition(reg, info.rc);
               parallelcopies.emplace_back(instr->operands[i], pc_def);
               return op.reg;
            }
         }
         return {};
      }
   }
   return {};
}

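/* Finds new positions for all given variables and appends the parallelcopies
 * needed to move them there, recursively displacing further variables if
 * required. Returns false if no valid placement exists. */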
bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
                    const PhysRegInterval def_reg)
{
   /* Variables are sorted from large to small and with increasing assigned register */
   for (unsigned id : vars) {
      assignment& var = ctx.assignments[id];
      PhysRegInterval bounds = get_reg_bounds(ctx, var.rc);
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition
       * also use the correct stride for sub-dword operands */
      bool is_dead_operand = false;
      std::optional<PhysReg> res;
      if (instr->opcode == aco_opcode::p_create_vector) {
         res =
            get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
      } else {
         for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
            if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
               info = DefInfo(ctx, instr, var.rc, i);
               if (instr->operands[i].isKillBeforeDef()) {
                  info.bounds = def_reg;
                  res = get_reg_simple(ctx, reg_file, info);
                  is_dead_operand = true;
               }
               break;
            }
         }
      }
      if (!res && !def_reg.size) {
         /* If this is before definitions are handled, def_reg may be an empty interval. */
         info.bounds = bounds;
         res = get_reg_simple(ctx, reg_file, info);
      } else if (!res) {
         /* Try to find space within the bounds but outside of the definition */
         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
         res = get_reg_simple(ctx, reg_file, info);
         if (!res && def_reg.hi() <= bounds.hi()) {
            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res) {
         /* mark the area as blocked */
         reg_file.block(*res, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(*res, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      PhysReg best_pos = bounds.lo();
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
           reg_win += stride) {
         if (!is_dead_operand && intersects(reg_win, def_reg))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (PhysReg j : reg_win) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(j) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_win.lo();
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      PhysRegInterval reg_win{best_pos, size};

      /* collect variables and block reg file */
      std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);

      /* mark the area as blocked */
      reg_file.block(reg_win.lo(), var.rc);
      adjust_max_used_regs(ctx, var.rc, reg_win.lo());

      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg))
         return false;

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}

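/* Finds a window for the definition, evicting blocking variables if needed:
 * a sliding window selects the position requiring the fewest moved registers,
 * and get_regs_for_copies() then relocates the displaced variables. */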
std::optional<PhysReg>
get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
             std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
             aco_ptr<Instruction>& instr)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(bounds);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   std::bitset<256> is_killed_operand; /* per-register */
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      Operand& op = instr->operands[j];
      if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
          !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
         assert(op.isFixed());

         for (unsigned i = 0; i < op.size(); ++i) {
            is_killed_operand[(op.physReg() & 0xff) + i] = true;
         }

         killed_ops += op.getTemp().size();
      }
   }

   assert((regs_free + ctx.num_linear_vgprs) >= size);

   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   PhysRegInterval best_win = {bounds.lo(), size};
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      /* first check if the register window starts in the middle of an
       * allocated variable: this is what we have to fix to allow for
       * num_moves > size */
      if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      for (const PhysReg j : reg_win) {
         /* dead operands effectively reduce the number of estimated moves */
         if (is_killed_operand[j & 0xFF]) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs */
         if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      best_win = reg_win;
      num_moves = k;
      num_vars = n;
   }

   if (num_moves == 0xFF)
      return {};

   /* now, we figured the placement for our definition */
   RegisterFile tmp_file(reg_file);

   /* p_create_vector: also re-place killed operands in the definition space */
   if (instr->opcode == aco_opcode::p_create_vector)
      tmp_file.fill_killed_operands(instr.get());

   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);

   /* re-enable killed operands */
   if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector)
      tmp_file.fill_killed_operands(instr.get());

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
      return {};

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   adjust_max_used_regs(ctx, rc, best_win.lo());
   return best_win.lo();
}

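/* Returns whether the given fixed register is a valid assignment for the
 * register class: correctly aligned, within bounds (or VCC/m0 where allowed)
 * and not already in use. */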
bool
get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
                  aco_ptr<Instruction>& instr, PhysReg reg, int operand)
{
   /* catch out-of-range registers */
   if (reg >= PhysReg{512})
      return false;

   DefInfo info(ctx, instr, rc, operand);

   if (reg.reg_b % info.data_stride)
      return false;

   assert(util_is_power_of_two_nonzero(info.stride));
   reg.reg_b &= ~(info.stride - 1);

   PhysRegInterval reg_win = {PhysReg(reg.reg()), info.rc.size()};
   PhysRegInterval vcc_win = {vcc, 2};
   /* VCC is outside the bounds */
   bool is_vcc =
      info.rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
   bool is_m0 = info.rc == s1 && reg == m0 && can_write_m0(instr);
   if (!info.bounds.contains(reg_win) && !is_vcc && !is_m0)
      return false;

   if (reg_file.test(reg, info.rc.bytes()))
      return false;

   adjust_max_used_regs(ctx, info.rc, reg_win.lo());
   return true;
}

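/* Tries to grow the register file by one register of the given type, updating
 * the program's register demand. Returns false if the limit is reached. */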
bool
increase_register_file(ra_ctx& ctx, RegClass rc)
{
   if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
       ctx.vgpr_bounds < ctx.vgpr_limit) {
      /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
      update_vgpr_sgpr_demand(
         ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));

      ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
   } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
      update_vgpr_sgpr_demand(
         ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));

      ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
   } else {
      return false;
   }

   return true;
}

struct IDAndRegClass {
   IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}

   unsigned id;
   RegClass rc;
};

struct IDAndInfo {
   IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}

   unsigned id;
   DefInfo info;
};

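/* Records new_val as the rename of orig_val in the current block and
 * remembers the original name, so later uses can be mapped to the renamed
 * temporary. */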
void
add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val)
{
   ctx.renames[ctx.block->index][orig_val.id()] = new_val;
   ctx.orig_names.emplace(new_val.id(), orig_val);
   ctx.assignments[orig_val.id()].renamed = true;
}

/* Reallocates vars by sorting them and placing each variable after the previous
 * one. If one of the variables has 0xffffffff as an ID, the register assigned
 * for that variable will be returned.
 */
PhysReg
compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
{
   /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
    * temporary sizes to dwords.
    */
   std::vector<IDAndInfo> sorted;
   for (IDAndRegClass var : vars) {
      DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
      sorted.emplace_back(var.id, info);
   }

   std::sort(
      sorted.begin(), sorted.end(),
      [&ctx](const IDAndInfo& a, const IDAndInfo& b)
      {
         unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
         unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
         if (a_stride > b_stride)
            return true;
         if (a_stride < b_stride)
            return false;
         if (a.id == 0xffffffff || b.id == 0xffffffff)
            return a.id ==
                   0xffffffff; /* place 0xffffffff before others if possible, not for any
                                  particular reason */
         return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
      });

   PhysReg next_reg = start;
   PhysReg space_reg;
   for (IDAndInfo& var : sorted) {
      unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
      next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));

      /* 0xffffffff is a special variable ID used to reserve space for killed
       * operands and definitions.
       */
      if (var.id != 0xffffffff) {
         if (next_reg != ctx.assignments[var.id].reg) {
            RegClass rc = ctx.assignments[var.id].rc;
            Temp tmp(var.id, rc);

            Operand pc_op(tmp);
            pc_op.setFixed(ctx.assignments[var.id].reg);
            Definition pc_def(next_reg, rc);
            parallelcopies.emplace_back(pc_op, pc_def);
         }
      } else {
         space_reg = next_reg;
      }

      adjust_max_used_regs(ctx, var.info.rc, next_reg);

      next_reg = next_reg.advance(var.info.rc.size() * 4);
   }

   return space_reg;
}

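/* Returns whether the already-assigned operands of a weak vector still line
 * up as one contiguous, in-bounds vector, so that allocating the remaining
 * parts next to them is still possible. */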
bool
is_vector_intact(ra_ctx& ctx, const RegisterFile& reg_file, const vector_info& vec_info)
{
   unsigned size = 0;
   for (unsigned i = 0; i < vec_info.num_parts; i++)
      size += vec_info.parts[i].bytes();

   PhysReg first{512};
   int offset = 0;
   for (unsigned i = 0; i < vec_info.num_parts; i++) {
      Operand op = vec_info.parts[i];

      if (ctx.assignments[op.tempId()].assigned) {
         PhysReg reg = ctx.assignments[op.tempId()].reg;

         if (first.reg() == 512) {
            PhysRegInterval bounds = get_reg_bounds(ctx, RegType::vgpr, false);
            first = reg.advance(-offset);
            PhysRegInterval vec = PhysRegInterval{first, DIV_ROUND_UP(size, 4)};
            if (!bounds.contains(vec)) /* not enough space for other operands */
               return false;
         } else {
            if (reg != first.advance(offset)) /* not at the best position */
               return false;
         }
      } else {
         /* If there's an unexpected temporary, this operand is unlikely to be
          * placed in the best position.
          */
         if (first.reg() != 512 && reg_file.test(first.advance(offset), op.bytes()))
            return false;
      }

      offset += op.bytes();
   }

   return true;
}

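/* Tries to allocate the temporary at a position consistent with the other
 * parts of its vector: relative to an already-assigned part if possible,
 * otherwise in a fresh window large enough for the whole vector. */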
1584 std::optional<PhysReg>
get_reg_vector(ra_ctx & ctx,const RegisterFile & reg_file,Temp temp,aco_ptr<Instruction> & instr,int operand)1585 get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr,
1586 int operand)
1587 {
1588 const vector_info& vec = ctx.vectors[temp.id()];
1589 if (!vec.is_weak || is_vector_intact(ctx, reg_file, vec)) {
1590 unsigned our_offset = 0;
1591 for (unsigned i = 0; i < vec.num_parts; i++) {
1592 const Operand& op = vec.parts[i];
1593 if (op.isTemp() && op.tempId() == temp.id())
1594 break;
1595 else
1596 our_offset += op.bytes();
1597 }
1598
1599 unsigned their_offset = 0;
1600 /* check for every operand of the vector
1601 * - whether the operand is assigned and
1602 * - we can use the register relative to that operand
1603 */
1604 for (unsigned i = 0; i < vec.num_parts; i++) {
1605 const Operand& op = vec.parts[i];
1606 if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
1607 ctx.assignments[op.tempId()].assigned) {
1608 PhysReg reg = ctx.assignments[op.tempId()].reg;
1609 reg.reg_b += (our_offset - their_offset);
1610 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg, operand))
1611 return reg;
1612
1613 /* return if MIMG vaddr components don't remain vector-aligned */
1614 if (vec.is_weak)
1615 return {};
1616 }
1617 their_offset += op.bytes();
1618 }
1619
1620 /* We didn't find a register relative to other vector operands.
1621 * Try to find new space which fits the whole vector.
1622 */
1623 RegClass vec_rc = RegClass::get(temp.type(), their_offset);
1624 DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
1625 std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info);
1626 if (reg) {
1627 reg->reg_b += our_offset;
1628 /* make sure to only use byte offset if the instruction supports it */
1629 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg, operand))
1630 return reg;
1631 }
1632 }
1633 return {};
1634 }
1635
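/* Compacts the linear VGPRs into a gap-free block at the top of the VGPR
 * space, recording parallelcopies for the moved variables. Returns false if
 * there were no gaps to remove; the freed registers become available for
 * normal VGPRs.
 */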
1636 bool
1637 compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
1638 std::vector<std::pair<Operand, Definition>>& parallelcopies)
1639 {
1640 PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
1641 int zeros = reg_file.count_zero(linear_vgpr_bounds);
1642 if (zeros == 0)
1643 return false;
1644
1645 std::vector<IDAndRegClass> vars;
1646 for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds))
1647 vars.emplace_back(id, ctx.assignments[id].rc);
1648
1649 ctx.num_linear_vgprs -= zeros;
1650 compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo());
1651
1652 return true;
1653 }
1654
1655 /* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate
1656 * from normal VGPRs. This is for two reasons:
1657 * - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a
1658 * linear one, we never have to swap a normal VGPR and a linear one.
1659  * - As linear VGPRs' live ranges only start and end on top-level blocks, we never have to move a
1660 * linear VGPR in control flow.
1661 */
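/* Illustrative layout (assuming vgpr_bounds == 64): with two single-dword
 * linear VGPRs live, they occupy v62 and v63 and grow downwards as more are
 * allocated, while normal VGPRs are limited to v0..v61.
 */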
1662 PhysReg
1663 alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
1664 std::vector<std::pair<Operand, Definition>>& parallelcopies)
1665 {
1666 assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
1667 assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
1668
1669 RegClass rc = instr->definitions[0].regClass();
1670
1671 /* Try to choose an unused space in the linear VGPR bounds. */
1672 for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) {
1673 PhysReg reg(256 + ctx.vgpr_bounds - i);
1674 if (!reg_file.test(reg, rc.bytes())) {
1675 adjust_max_used_regs(ctx, rc, reg);
1676 return reg;
1677 }
1678 }
1679
1680 PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
1681
1682 /* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */
1683 compact_linear_vgprs(ctx, reg_file, parallelcopies);
1684
1685 PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size()));
1686 /* Space that was for normal VGPRs, but is now for linear VGPRs. */
1687 PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg));
1688
1689 RegisterFile tmp_file(reg_file);
1690 PhysRegInterval reg_win{reg, rc.size()};
1691 std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win);
1692
1693 /* Re-enable killed operands */
1694 tmp_file.fill_killed_operands(instr.get());
1695
1696 /* Find new assignments for blocking vars. */
1697 std::vector<std::pair<Operand, Definition>> pc;
1698 if (!ctx.policy.skip_optimistic_path &&
1699 get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
1700 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1701 } else {
1702 /* Fallback algorithm: reallocate all variables at once. */
1703 std::vector<IDAndRegClass> vars;
1704 for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds))
1705 vars.emplace_back(id, ctx.assignments[id].rc);
1706 compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256));
1707
1708 std::vector<IDAndRegClass> killed_op_vars;
1709 for (Operand& op : instr->operands) {
1710 if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr)
1711 killed_op_vars.emplace_back(op.tempId(), op.regClass());
1712 }
1713 compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo());
1714 }
1715
1716 /* If this is updated earlier, a killed operand can't be placed inside the definition. */
1717 ctx.num_linear_vgprs += rc.size();
1718
1719 adjust_max_used_regs(ctx, rc, reg);
1720 return reg;
1721 }
1722
1723 bool
1724 should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file)
1725 {
1726 if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty())
1727 return false;
1728
1729 /* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to
1730 * ensure in advance that there is enough space for normal VGPRs. */
1731 unsigned max_vgpr_usage = 0;
1732 unsigned next_toplevel = ctx.block->index + 1;
1733 for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) {
1734 max_vgpr_usage =
1735 MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr);
1736 }
1737 max_vgpr_usage =
1738 MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].live_in_demand.vgpr);
1739
1740 for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true)))
1741 max_vgpr_usage -= ctx.assignments[tmp].rc.size();
1742
1743 return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
1744 }
1745
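/* Main register-selection routine. Strategies are tried roughly in order of
 * increasing cost: affinities (split vectors, phis) and vcc/m0 hints, vector
 * placement, reuse of a killed operand's register, a simple scan without
 * live-range splits, live-range splitting, compacting linear VGPRs, growing
 * the register file, and finally relocating all variables at once.
 */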
1746 PhysReg
1747 get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
1748 std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
1749 int operand_index = -1)
1750 {
1751 auto split_vec = ctx.split_vectors.find(temp.id());
1752 if (split_vec != ctx.split_vectors.end()) {
1753 unsigned offset = 0;
1754 for (Definition def : split_vec->second->definitions) {
1755 if (ctx.assignments[def.tempId()].affinity) {
1756 assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
1757 if (affinity.assigned) {
1758 PhysReg reg = affinity.reg;
1759 reg.reg_b -= offset;
1760 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg, operand_index))
1761 return reg;
1762 }
1763 }
1764 offset += def.bytes();
1765 }
1766 }
1767
1768 if (ctx.assignments[temp.id()].affinity) {
1769 assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
1770 if (affinity.assigned) {
1771 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg, operand_index))
1772 return affinity.reg;
1773 }
1774 }
1775 if (ctx.assignments[temp.id()].vcc) {
1776 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc, operand_index))
1777 return vcc;
1778 }
1779 if (ctx.assignments[temp.id()].m0) {
1780 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0, operand_index))
1781 return m0;
1782 }
1783
1784 std::optional<PhysReg> res;
1785
1786 if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
1787 res = get_reg_vector(ctx, reg_file, temp, instr, operand_index);
1788 if (res)
1789 return *res;
1790 }
1791
1792 if (temp.size() == 1 && operand_index == -1) {
1793 for (const Operand& op : instr->operands) {
1794 if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass() == temp.regClass()) {
1795 assert(op.isFixed());
1796 if (op.physReg() == vcc || op.physReg() == vcc_hi)
1797 continue;
1798 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, op.physReg(),
1799 operand_index))
1800 return op.physReg();
1801 }
1802 }
1803 }
1804
1805 DefInfo info(ctx, instr, temp.regClass(), operand_index);
1806
1807 if (!ctx.policy.skip_optimistic_path) {
1808 /* try to find space without live-range splits */
1809 res = get_reg_simple(ctx, reg_file, info);
1810
1811 if (res)
1812 return *res;
1813 }
1814
1815 /* try to find space with live-range splits */
1816 res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
1817
1818 if (res)
1819 return *res;
1820
1821 /* try compacting the linear vgprs to make more space */
1822 std::vector<std::pair<Operand, Definition>> pc;
1823 if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
1824 compact_linear_vgprs(ctx, reg_file, pc)) {
1825 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1826
1827 /* We don't need to fill the copy definitions in because we don't care about the linear VGPR
1828 * space here. */
1829 RegisterFile tmp_file(reg_file);
1830 for (std::pair<Operand, Definition>& copy : pc)
1831 tmp_file.clear(copy.first);
1832
1833 return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
1834 }
1835
1836 /* We should only fail here because keeping under the limit would require
1837 * too many moves. */
1838 assert(reg_file.count_zero(info.bounds) >= info.size);
1839
1840 /* try using more registers */
1841 if (!increase_register_file(ctx, info.rc)) {
1842 /* fallback algorithm: reallocate all variables at once (linear VGPRs should already be
1843 * compact at the end) */
1844 unsigned def_size = info.rc.size();
1845 for (Definition def : instr->definitions) {
1846 if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1847 def_size += def.regClass().size();
1848 }
1849
1850 unsigned killed_op_size = 0;
1851 for (Operand op : instr->operands) {
1852 if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == info.rc.type())
1853 killed_op_size += op.regClass().size();
1854 }
1855
1856 const PhysRegInterval regs = get_reg_bounds(ctx, info.rc);
1857
1858 /* reallocate passthrough variables and non-killed operands */
1859 std::vector<IDAndRegClass> vars;
1860 for (unsigned id : find_vars(ctx, reg_file, regs))
1861 vars.emplace_back(id, ctx.assignments[id].rc);
1862 vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));
1863
1864 PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());
1865
1866 /* reallocate killed operands */
1867 std::vector<IDAndRegClass> killed_op_vars;
1868 for (Operand op : instr->operands) {
1869 if (op.isFirstKillBeforeDef() && op.regClass().type() == info.rc.type())
1870 killed_op_vars.emplace_back(op.tempId(), op.regClass());
1871 }
1872 compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);
1873
1874 /* reallocate definitions */
1875 std::vector<IDAndRegClass> def_vars;
1876 for (Definition def : instr->definitions) {
1877 if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1878 def_vars.emplace_back(def.tempId(), def.regClass());
1879 }
1880 def_vars.emplace_back(0xffffffff, info.rc);
1881 return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
1882 }
1883
1884 return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
1885 }
1886
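/* Illustrative example for the scoring below: if killed operand 1 (at byte
 * offset 4 within the vector) currently lives in v7, placing the
 * p_create_vector definition at v6 leaves that operand in place. Each
 * candidate position is charged the number of bytes that would still have to
 * move, and the cheapest one wins.
 */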
1887 PhysReg
1888 get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
1889 std::vector<std::pair<Operand, Definition>>& parallelcopies,
1890 aco_ptr<Instruction>& instr)
1891 {
1892 RegClass rc = temp.regClass();
1893 /* create_vector instructions have different costs w.r.t. register coalescing */
1894 uint32_t size = rc.size();
1895 uint32_t bytes = rc.bytes();
1896 uint32_t stride = get_stride(rc);
1897 PhysRegInterval bounds = get_reg_bounds(ctx, rc);
1898
1899 // TODO: improve p_create_vector for sub-dword vectors
1900
1901 PhysReg best_pos{0xFFF};
1902 unsigned num_moves = 0xFF;
1903 bool best_avoid = true;
1904 uint32_t correct_pos_mask = 0;
1905
1906 /* test for each operand which definition placement causes the least shuffle instructions */
1907 for (unsigned i = 0, offset = 0; i < instr->operands.size();
1908 offset += instr->operands[i].bytes(), i++) {
1909       // TODO: think about whether we can alias live operands on the same register
1910 if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
1911 instr->operands[i].getTemp().type() != rc.type())
1912 continue;
1913
1914 if (offset > instr->operands[i].physReg().reg_b)
1915 continue;
1916
1917 unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
1918 if (reg_lower % 4)
1919 continue;
1920 PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
1921 unsigned k = 0;
1922
1923 /* no need to check multiple times */
1924 if (reg_win.lo() == best_pos)
1925 continue;
1926
1927 /* check borders */
1928       // TODO: this can be improved
1929 if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
1930 continue;
1931 if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
1932 reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
1933 continue;
1934 if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
1935 reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
1936 continue;
1937
1938 /* count variables to be moved and check "avoid" */
1939 bool avoid = false;
1940 bool linear_vgpr = false;
1941 for (PhysReg j : reg_win) {
1942 if (reg_file[j] != 0) {
1943 if (reg_file[j] == 0xF0000000) {
1944 PhysReg reg;
1945 reg.reg_b = j * 4;
1946 unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
1947 for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
1948 k += reg_file.test(reg, 1);
1949 } else {
1950 k += 4;
1951 linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
1952 }
1953 }
1954 avoid |= ctx.war_hint[j];
1955 }
1956
1957 /* we cannot split live ranges of linear vgprs */
1958 if (linear_vgpr)
1959 continue;
1960
1961 if (avoid && !best_avoid)
1962 continue;
1963
1964 /* count operands in wrong positions */
1965 uint32_t correct_pos_mask_new = 0;
1966 for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
1967 offset2 += instr->operands[j].bytes(), j++) {
1968 Operand& op = instr->operands[j];
1969 if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
1970 correct_pos_mask_new |= 1 << j;
1971 else
1972 k += op.bytes();
1973 }
1974 bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
1975 if (k > num_moves || (!aligned && k == num_moves))
1976 continue;
1977
1978 best_pos = reg_win.lo();
1979 num_moves = k;
1980 best_avoid = avoid;
1981 correct_pos_mask = correct_pos_mask_new;
1982 }
1983
1984 /* too many moves: try the generic get_reg() function */
1985 if (num_moves >= 2 * bytes) {
1986 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1987 } else if (num_moves > bytes) {
1988 DefInfo info(ctx, instr, rc, -1);
1989 std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
1990 if (res)
1991 return *res;
1992 }
1993
1994 /* re-enable killed operands which are in the wrong position */
1995 RegisterFile tmp_file(reg_file);
1996 tmp_file.fill_killed_operands(instr.get());
1997
1998 for (unsigned i = 0; i < instr->operands.size(); i++) {
1999 if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
2000 tmp_file.clear(instr->operands[i]);
2001 }
2002
2003 /* collect variables to be moved */
2004 std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
2005
2006 bool success = false;
2007 std::vector<std::pair<Operand, Definition>> pc;
2008 success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});
2009
2010 if (!success) {
2011 if (!increase_register_file(ctx, temp.regClass())) {
2012 /* use the fallback algorithm in get_reg() */
2013 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
2014 }
2015 return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
2016 }
2017
2018 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
2019 adjust_max_used_regs(ctx, rc, best_pos);
2020
2021 return best_pos;
2022 }
2023
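/* Lowering these pseudo instructions may need to preserve SCC while copying
 * linear registers. If SCC itself is unused, it doubles as the scratch
 * register; otherwise an unoccupied SGPR is searched for, preferably at or
 * below max_used_sgpr so that no additional register is occupied.
 */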
2024 void
2025 handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
2026 {
2027 if (instr->format != Format::PSEUDO)
2028 return;
2029
2030 /* all instructions which use handle_operands() need this information */
2031 switch (instr->opcode) {
2032 case aco_opcode::p_extract_vector:
2033 case aco_opcode::p_create_vector:
2034 case aco_opcode::p_split_vector:
2035 case aco_opcode::p_parallelcopy:
2036 case aco_opcode::p_start_linear_vgpr: break;
2037 default: return;
2038 }
2039
2040 bool writes_linear = false;
2041    /* if all definitions are logical VGPRs, no need to care about SCC */
2042 for (Definition& def : instr->definitions) {
2043 if (def.getTemp().regClass().is_linear())
2044 writes_linear = true;
2045 }
2046 /* if all operands are constant, no need to care either */
2047 bool reads_linear = false;
2048 for (Operand& op : instr->operands) {
2049 if (op.isTemp() && op.getTemp().regClass().is_linear())
2050 reads_linear = true;
2051 }
2052
2053 if (!writes_linear || !reads_linear)
2054 return;
2055
2056 instr->pseudo().needs_scratch_reg = true;
2057
2058 if (!reg_file[scc]) {
2059 instr->pseudo().scratch_sgpr = scc;
2060 return;
2061 }
2062
2063 int reg = ctx.max_used_sgpr;
2064 for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
2065 ;
2066 if (reg < 0) {
2067 reg = ctx.max_used_sgpr + 1;
2068 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
2069 ;
2070 }
2071
2072 adjust_max_used_regs(ctx, s1, reg);
2073 instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
2074 }
2075
2076 bool
2077 operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
2078 RegClass rc)
2079 {
2080 if (reg.byte()) {
2081 unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
2082 if (reg.byte() % stride)
2083 return false;
2084 }
2085
2086 switch (instr->format) {
2087 case Format::SMEM:
2088 return reg != scc && reg != exec &&
2089 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
2090 (reg != vcc || (instr->definitions.empty() && idx == 2) ||
2091 gfx_level >= GFX10); /* sdata can be vcc */
2092 case Format::MUBUF:
2093 case Format::MTBUF: return idx != 2 || gfx_level < GFX12 || reg != scc;
2094 case Format::SOPK:
2095 if (idx == 0 && reg == scc)
2096 return false;
2097 FALLTHROUGH;
2098 case Format::SOP2:
2099 case Format::SOP1:
2100 return get_op_fixed_to_def(instr.get()) != (int)idx ||
2101 is_sgpr_writable_without_side_effects(gfx_level, reg);
2102 default:
2103 // TODO: there are more instructions with restrictions on registers
2104 return true;
2105 }
2106 }
2107
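/* Moves precolored operands into their required registers: operands already in
 * place are blocked so nothing else is moved onto them; for the rest, a
 * parallelcopy into the fixed register is recorded and the variables currently
 * blocking that interval are relocated via get_regs_for_copies(). A bitset
 * tracks which of the (at most 128) operands need such treatment.
 */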
2108 void
2109 handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
2110 std::vector<std::pair<Operand, Definition>>& parallelcopy,
2111 aco_ptr<Instruction>& instr)
2112 {
2113 assert(instr->operands.size() <= 128);
2114 assert(parallelcopy.empty());
2115
2116 RegisterFile tmp_file(register_file);
2117
2118 BITSET_DECLARE(mask, 128) = {0};
2119
2120 for (unsigned i = 0; i < instr->operands.size(); i++) {
2121 Operand& op = instr->operands[i];
2122
2123 if (!op.isPrecolored())
2124 continue;
2125
2126 assert(op.isTemp());
2127 PhysReg src = ctx.assignments[op.tempId()].reg;
2128 adjust_max_used_regs(ctx, op.regClass(), op.physReg());
2129
2130 if (op.physReg() == src) {
2131 tmp_file.block(op.physReg(), op.regClass());
2132 continue;
2133 }
2134
2135 /* An instruction can have at most one operand precolored to the same register. */
2136 assert(std::none_of(parallelcopy.begin(), parallelcopy.end(),
2137 [&](auto copy) { return copy.second.physReg() == op.physReg(); }));
2138
2139       /* clear from register_file so fixed operands are not collected by collect_vars() */
2140 tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
2141
2142 BITSET_SET(mask, i);
2143
2144 Operand pc_op(instr->operands[i].getTemp());
2145 pc_op.setFixed(src);
2146 Definition pc_def = Definition(op.physReg(), pc_op.regClass());
2147 parallelcopy.emplace_back(pc_op, pc_def);
2148 }
2149
2150 if (BITSET_IS_EMPTY(mask))
2151 return;
2152
2153 unsigned i;
2154 std::vector<unsigned> blocking_vars;
2155 BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
2156 Operand& op = instr->operands[i];
2157 PhysRegInterval target{op.physReg(), op.size()};
2158 std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
2159 blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
2160
2161 /* prevent get_regs_for_copies() from using these registers */
2162 tmp_file.block(op.physReg(), op.regClass());
2163 }
2164
2165 get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
2166 update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
2167 }
2168
2169 void
2170 get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
2171 std::vector<std::pair<Operand, Definition>>& parallelcopy,
2172 aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
2173 {
2174 /* clear the operand in case it's only a stride mismatch */
2175 PhysReg src = ctx.assignments[operand.tempId()].reg;
2176 register_file.clear(src, operand.regClass());
2177 PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
2178
2179 Operand pc_op = operand;
2180 pc_op.setFixed(src);
2181 Definition pc_def = Definition(dst, pc_op.regClass());
2182 parallelcopy.emplace_back(pc_op, pc_def);
2183 update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
2184 register_file.fill(Definition(operand.getTemp(), dst));
2185 }
2186
2187 PhysReg
2188 get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
2189 std::vector<aco_ptr<Instruction>>& instructions, Block& block,
2190 aco_ptr<Instruction>& phi, Temp tmp)
2191 {
2192 std::vector<std::pair<Operand, Definition>> parallelcopy;
2193 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
2194 update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
2195
2196 /* process parallelcopy */
2197 for (std::pair<Operand, Definition> pc : parallelcopy) {
2198 /* see if it's a copy from a different phi */
2199 // TODO: prefer moving some previous phis over live-ins
2200 // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
2201 // problem in practice since they can only be fixed to exec)
2202 Instruction* prev_phi = NULL;
2203 for (auto phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
2204 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
2205 prev_phi = phi_it->get();
2206 }
2207 if (prev_phi) {
2208 /* if so, just update that phi's register */
2209 prev_phi->definitions[0].setFixed(pc.second.physReg());
2210 register_file.fill(prev_phi->definitions[0]);
2211 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
2212 pc.second.regClass()};
2213 continue;
2214 }
2215
2216 /* rename */
2217 auto orig_it = ctx.orig_names.find(pc.first.tempId());
2218 Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
2219 add_rename(ctx, orig, pc.second.getTemp());
2220
2221 /* otherwise, this is a live-in and we need to create a new phi
2222 * to move it in this block's predecessors */
2223 aco_opcode opcode =
2224 pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2225 Block::edge_vec& preds =
2226 pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
2227 aco_ptr<Instruction> new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
2228 new_phi->definitions[0] = pc.second;
2229 for (unsigned i = 0; i < preds.size(); i++)
2230 new_phi->operands[i] = Operand(pc.first);
2231 instructions.emplace_back(std::move(new_phi));
2232
2233 /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
2234 * a loop header.
2235 */
2236 live_in.erase(orig.id());
2237 }
2238
2239 return reg;
2240 }
2241
2242 void
2243 get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
2244 std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
2245 {
2246 /* move all phis to instructions */
2247 bool has_linear_phis = false;
2248 for (aco_ptr<Instruction>& phi : block.instructions) {
2249 if (!is_phi(phi))
2250 break;
2251 if (!phi->definitions[0].isKill()) {
2252 has_linear_phis |= phi->opcode == aco_opcode::p_linear_phi;
2253 instructions.emplace_back(std::move(phi));
2254 }
2255 }
2256
2257 /* assign phis with all-matching registers to that register */
2258 for (aco_ptr<Instruction>& phi : instructions) {
2259 Definition& definition = phi->definitions[0];
2260 if (definition.isFixed())
2261 continue;
2262
2263 if (!phi->operands[0].isTemp())
2264 continue;
2265
2266 PhysReg reg = phi->operands[0].physReg();
2267 auto OpsSame = [=](const Operand& op) -> bool
2268 { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
2269 bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
2270 if (!all_same)
2271 continue;
2272
2273 if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg, -1))
2274 continue;
2275
2276 definition.setFixed(reg);
2277 register_file.fill(definition);
2278 ctx.assignments[definition.tempId()].set(definition);
2279 }
2280
2281 /* try to find a register that is used by at least one operand */
2282 for (aco_ptr<Instruction>& phi : instructions) {
2283 Definition& definition = phi->definitions[0];
2284 if (definition.isFixed())
2285 continue;
2286
2287 /* use affinity if available */
2288 if (ctx.assignments[definition.tempId()].affinity &&
2289 ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
2290 assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
2291 assert(affinity.rc == definition.regClass());
2292 if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg, -1)) {
2293 definition.setFixed(affinity.reg);
2294 register_file.fill(definition);
2295 ctx.assignments[definition.tempId()].set(definition);
2296 continue;
2297 }
2298 }
2299
2300 /* by going backwards, we aim to avoid copies in else-blocks */
2301 for (int i = phi->operands.size() - 1; i >= 0; i--) {
2302 const Operand& op = phi->operands[i];
2303 if (!op.isTemp() || !op.isFixed())
2304 continue;
2305
2306 PhysReg reg = op.physReg();
2307 if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg, -1)) {
2308 definition.setFixed(reg);
2309 register_file.fill(definition);
2310 ctx.assignments[definition.tempId()].set(definition);
2311 break;
2312 }
2313 }
2314 }
2315
2316 /* find registers for phis where the register was blocked or no operand was assigned */
2317
2318 /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
2319 for (unsigned i = 0; i < instructions.size(); i++) {
2320 aco_ptr<Instruction>& phi = instructions[i];
2321 Definition& definition = phi->definitions[0];
2322 if (definition.isFixed())
2323 continue;
2324
2325 definition.setFixed(
2326 get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));
2327
2328 register_file.fill(definition);
2329 ctx.assignments[definition.tempId()].set(definition);
2330 }
2331
2332 /* Provide a scratch register in case we need to preserve SCC */
2333 if (has_linear_phis || block.kind & block_kind_loop_header) {
2334 PhysReg scratch_reg = scc;
2335 if (register_file[scc]) {
2336 scratch_reg = get_reg_phi(ctx, live_in, register_file, instructions, block, ctx.phi_dummy,
2337 Temp(0, s1));
2338 if (block.kind & block_kind_loop_header)
2339 ctx.loop_header.back().second = scratch_reg;
2340 }
2341
2342 for (aco_ptr<Instruction>& phi : instructions) {
2343 phi->pseudo().scratch_sgpr = scratch_reg;
2344 phi->pseudo().needs_scratch_reg = true;
2345 }
2346 }
2347 }
2348
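/* SSA repair after live-range splits: every copy introduces a new temp id,
 * recorded per block in ctx.renames. read_variable() returns the current name
 * of a value within one block; handle_live_in() resolves names across
 * predecessors and inserts a new phi when they disagree.
 */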
2349 inline Temp
2350 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
2351 {
2352    /* This variable didn't get renamed yet. */
2353 if (!ctx.assignments[val.id()].renamed)
2354 return val;
2355
2356 auto it = ctx.renames[block_idx].find(val.id());
2357 if (it == ctx.renames[block_idx].end())
2358 return val;
2359 else
2360 return it->second;
2361 }
2362
2363 Temp
2364 handle_live_in(ra_ctx& ctx, Temp val, Block* block)
2365 {
2366    /* This variable didn't get renamed yet. */
2367 if (!ctx.assignments[val.id()].renamed)
2368 return val;
2369
2370 Block::edge_vec& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
2371 if (preds.size() == 0)
2372 return val;
2373
2374 if (preds.size() == 1) {
2375 /* if the block has only one predecessor, just look there for the name */
2376 return read_variable(ctx, val, preds[0]);
2377 }
2378
2379 /* there are multiple predecessors and the block is sealed */
2380 Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));
2381
2382 /* get the rename from each predecessor and check if they are the same */
2383 Temp new_val;
2384 bool needs_phi = false;
2385 for (unsigned i = 0; i < preds.size(); i++) {
2386 ops[i] = read_variable(ctx, val, preds[i]);
2387 if (i == 0)
2388 new_val = ops[i];
2389 else
2390 needs_phi |= !(new_val == ops[i]);
2391 }
2392
2393 if (needs_phi) {
2394 assert(!val.regClass().is_linear_vgpr());
2395
2396 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
2397 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2398 aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
2399 new_val = ctx.program->allocateTmp(val.regClass());
2400 phi->definitions[0] = Definition(new_val);
2401 ctx.assignments.emplace_back();
2402 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
2403 for (unsigned i = 0; i < preds.size(); i++) {
2404 /* update the operands so that it uses the new affinity */
2405 phi->operands[i] = Operand(ops[i]);
2406 assert(ctx.assignments[ops[i].id()].assigned);
2407 assert(ops[i].regClass() == new_val.regClass());
2408 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
2409 }
2410 block->instructions.insert(block->instructions.begin(), std::move(phi));
2411 }
2412
2413 return new_val;
2414 }
2415
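/* Fixes up loop header phis after the loop body has been processed: values
 * renamed inside the loop get their new names in the loop-carried phi
 * operands, and the renames for phis created by handle_live_in() are
 * propagated through all blocks of the loop.
 */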
2416 void
2417 handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
2418 uint32_t loop_exit_idx, PhysReg scratch_reg)
2419 {
2420 Block& loop_header = ctx.program->blocks[loop_header_idx];
2421 aco::unordered_map<uint32_t, Temp> renames(ctx.memory);
2422
2423 /* create phis for variables renamed during the loop */
2424 for (unsigned t : live_in) {
2425 if (!ctx.assignments[t].renamed)
2426 continue;
2427
2428 Temp val = Temp(t, ctx.program->temp_rc[t]);
2429 Temp prev = read_variable(ctx, val, loop_header_idx - 1);
2430 Temp renamed = handle_live_in(ctx, val, &loop_header);
2431 if (renamed == prev)
2432 continue;
2433
2434 /* insert additional renames at block end, but don't overwrite */
2435 renames[prev.id()] = renamed;
2436 ctx.orig_names[renamed.id()] = val;
2437 for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2438 auto it = ctx.renames[idx].emplace(val.id(), renamed);
2439 /* if insertion is unsuccessful, update if necessary */
2440 if (!it.second && it.first->second == prev)
2441 it.first->second = renamed;
2442 }
2443
2444 /* update loop-carried values of the phi created by handle_live_in() */
2445 for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
2446 Operand& op = loop_header.instructions[0]->operands[i];
2447 if (op.getTemp() == prev)
2448 op.setTemp(renamed);
2449 }
2450
2451 /* use the assignment from the loop preheader and fix def reg */
2452 assignment& var = ctx.assignments[prev.id()];
2453 ctx.assignments[renamed.id()] = var;
2454 loop_header.instructions[0]->definitions[0].setFixed(var.reg);
2455
2456 /* Set scratch register */
2457 loop_header.instructions[0]->pseudo().scratch_sgpr = scratch_reg;
2458 loop_header.instructions[0]->pseudo().needs_scratch_reg = true;
2459 }
2460
2461    /* rename loop-carried phi operands */
2462 for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
2463 aco_ptr<Instruction>& phi = loop_header.instructions[i];
2464 if (!is_phi(phi))
2465 break;
2466 const Block::edge_vec& preds =
2467 phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
2468 for (unsigned j = 1; j < phi->operands.size(); j++) {
2469 Operand& op = phi->operands[j];
2470 if (!op.isTemp())
2471 continue;
2472
2473 /* Find the original name, since this operand might not use the original name if the phi
2474 * was created after init_reg_file().
2475 */
2476 auto it = ctx.orig_names.find(op.tempId());
2477 Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();
2478
2479 op.setTemp(read_variable(ctx, orig, preds[j]));
2480 op.setFixed(ctx.assignments[op.tempId()].reg);
2481 }
2482 }
2483
2484 /* return early if no new phi was created */
2485 if (renames.empty())
2486 return;
2487
2488 /* propagate new renames through loop */
2489 for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2490 Block& current = ctx.program->blocks[idx];
2491 /* rename all uses in this block */
2492 for (aco_ptr<Instruction>& instr : current.instructions) {
2493 /* phis are renamed after RA */
2494 if (idx == loop_header_idx && is_phi(instr))
2495 continue;
2496
2497 for (Operand& op : instr->operands) {
2498 if (!op.isTemp())
2499 continue;
2500
2501 auto rename = renames.find(op.tempId());
2502 if (rename != renames.end()) {
2503 assert(rename->second.id());
2504 op.setTemp(rename->second);
2505 }
2506 }
2507 }
2508 }
2509 }
2510
2511 /**
2512 * This function serves the purpose to correctly initialize the register file
2513 * at the beginning of a block (before any existing phis).
2514 * In order to do so, all live-in variables are entered into the RegisterFile.
2515 * Reg-to-reg moves (renames) from previous blocks are taken into account and
2516 * the SSA is repaired by inserting corresponding phi-nodes.
2517 */
2518 RegisterFile
2519 init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
2520 {
2521 if (block.kind & block_kind_loop_exit) {
2522 uint32_t header = ctx.loop_header.back().first;
2523 PhysReg scratch_reg = ctx.loop_header.back().second;
2524 ctx.loop_header.pop_back();
2525 handle_loop_phis(ctx, live_out_per_block[header], header, block.index, scratch_reg);
2526 }
2527
2528 RegisterFile register_file;
2529 const IDSet& live_in = live_out_per_block[block.index];
2530 assert(block.index != 0 || live_in.empty());
2531
2532 if (block.kind & block_kind_loop_header) {
2533 ctx.loop_header.emplace_back(block.index, PhysReg{scc});
2534       /* eagerly rename the phis' incoming values */
2535 for (aco_ptr<Instruction>& instr : block.instructions) {
2536 if (!is_phi(instr))
2537 break;
2538 Operand& operand = instr->operands[0];
2539 if (operand.isTemp()) {
2540 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
2541 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2542 }
2543 }
2544 for (unsigned t : live_in) {
2545 Temp val = Temp(t, ctx.program->temp_rc[t]);
2546 Temp renamed = read_variable(ctx, val, block.index - 1);
2547 if (renamed != val)
2548 add_rename(ctx, val, renamed);
2549 assignment& var = ctx.assignments[renamed.id()];
2550 assert(var.assigned);
2551 register_file.fill(Definition(renamed, var.reg));
2552 }
2553 } else {
2554 /* rename phi operands */
2555 for (aco_ptr<Instruction>& instr : block.instructions) {
2556 if (!is_phi(instr))
2557 break;
2558 const Block::edge_vec& preds =
2559 instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
2560
2561 for (unsigned i = 0; i < instr->operands.size(); i++) {
2562 Operand& operand = instr->operands[i];
2563 if (!operand.isTemp())
2564 continue;
2565 operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2566 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2567 }
2568 }
2569 for (unsigned t : live_in) {
2570 Temp val = Temp(t, ctx.program->temp_rc[t]);
2571 Temp renamed = handle_live_in(ctx, val, &block);
2572 assignment& var = ctx.assignments[renamed.id()];
2573          /* due to live-range splits, the live-in might be a phi now */
2574 if (var.assigned) {
2575 register_file.fill(Definition(renamed, var.reg));
2576 }
2577 if (renamed != val) {
2578 add_rename(ctx, val, renamed);
2579 }
2580 }
2581 }
2582
2583 return register_file;
2584 }
2585
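/* Illustrative example: v_mad_f32 v0, v1, v2, v0 can become
 * v_mac_f32 v0, v1, v2, because src2 is a killed VGPR that can double as the
 * accumulator/destination. VOP2 cannot encode omod, clamp or most abs/neg and
 * opsel bits, so such cases are rejected below.
 */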
2586 bool
2587 vop3_can_use_vop2acc(ra_ctx& ctx, Instruction* instr)
2588 {
2589 if (!instr->isVOP3() && !instr->isVOP3P())
2590 return false;
2591
2592 switch (instr->opcode) {
2593 case aco_opcode::v_mad_f32:
2594 case aco_opcode::v_mad_f16:
2595 case aco_opcode::v_mad_legacy_f16: break;
2596 case aco_opcode::v_fma_f32:
2597 case aco_opcode::v_pk_fma_f16:
2598 case aco_opcode::v_fma_f16:
2599 case aco_opcode::v_dot4_i32_i8:
2600 if (ctx.program->gfx_level < GFX10)
2601 return false;
2602 break;
2603 case aco_opcode::v_mad_legacy_f32:
2604 if (!ctx.program->dev.has_mac_legacy32)
2605 return false;
2606 break;
2607 case aco_opcode::v_fma_legacy_f32:
2608 if (!ctx.program->dev.has_fmac_legacy32)
2609 return false;
2610 break;
2611 default: return false;
2612 }
2613
2614 if (!instr->operands[2].isOfType(RegType::vgpr) || !instr->operands[2].isKillBeforeDef() ||
2615 (!instr->operands[0].isOfType(RegType::vgpr) && !instr->operands[1].isOfType(RegType::vgpr)))
2616 return false;
2617
2618 if (instr->isVOP3P()) {
2619 for (unsigned i = 0; i < 3; i++) {
2620 if (instr->operands[i].isLiteral())
2621 continue;
2622
2623 if (instr->valu().opsel_lo[i])
2624 return false;
2625
2626 /* v_pk_fmac_f16 inline constants are replicated to hi bits starting with gfx11. */
2627 if (instr->valu().opsel_hi[i] ==
2628 (instr->operands[i].isConstant() && ctx.program->gfx_level >= GFX11))
2629 return false;
2630 }
2631 } else {
2632 if (instr->valu().opsel & (ctx.program->gfx_level < GFX11 ? 0xf : ~0x3))
2633 return false;
2634 for (unsigned i = 0; i < 2; i++) {
2635 if (!instr->operands[i].isOfType(RegType::vgpr) && instr->valu().opsel[i])
2636 return false;
2637 }
2638 }
2639
2640 unsigned im_mask = instr->isDPP16() && instr->isVOP3() ? 0x3 : 0;
2641 if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) ||
2642 (instr->valu().neg & ~im_mask))
2643 return false;
2644
2645 return true;
2646 }
2647
2648 bool
2649 sop2_can_use_sopk(ra_ctx& ctx, Instruction* instr)
2650 {
2651 if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_add_u32 &&
2652 instr->opcode != aco_opcode::s_mul_i32 && instr->opcode != aco_opcode::s_cselect_b32)
2653 return false;
2654
2655 if (instr->opcode == aco_opcode::s_add_u32 && !instr->definitions[1].isKill())
2656 return false;
2657
2658 uint32_t literal_idx = 0;
2659
2660 if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
2661 literal_idx = 1;
2662
2663 if (!instr->operands[!literal_idx].isTemp() || !instr->operands[!literal_idx].isKillBeforeDef())
2664 return false;
2665
2666 if (!instr->operands[literal_idx].isLiteral())
2667 return false;
2668
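   /* SOPK encodes a 16-bit signed immediate, so the literal must be the sign
    * extension of its low 16 bits: bits [31:15] all zero or all one
    * (e.g. 0x00001234 and 0xffff8000 fit, 0x00018000 does not).
    */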
2669 const uint32_t i16_mask = 0xffff8000u;
2670 uint32_t value = instr->operands[literal_idx].constantValue();
2671 if ((value & i16_mask) && (value & i16_mask) != i16_mask)
2672 return false;
2673
2674 return true;
2675 }
2676
2677 void
2678 create_phi_vector_affinities(ra_ctx& ctx, aco_ptr<Instruction>& instr,
2679 std::map<Operand*, std::vector<vector_info>>& vector_phis)
2680 {
2681 auto it = ctx.vectors.find(instr->definitions[0].tempId());
2682 if (it == ctx.vectors.end())
2683 return;
2684 vector_info& dest_vector = it->second;
2685
2686 auto pair = vector_phis.try_emplace(dest_vector.parts, instr->operands.size(), dest_vector);
2687 std::vector<vector_info>& src_vectors = pair.first->second;
2688 if (pair.second) {
2689 RegType type = instr->definitions[0].regClass().type();
2690
2691 for (vector_info& src_vector : src_vectors) {
2692 src_vector.parts =
2693 (Operand*)ctx.memory.allocate(sizeof(Operand) * src_vector.num_parts, alignof(Operand));
2694 for (unsigned j = 0; j < src_vector.num_parts; j++)
2695 src_vector.parts[j] = Operand(RegClass::get(type, dest_vector.parts[j].bytes()));
2696 }
2697 }
2698
2699 unsigned index = 0;
2700 for (; index < dest_vector.num_parts; index++) {
2701 if (dest_vector.parts[index].isTemp() &&
2702 dest_vector.parts[index].tempId() == instr->definitions[0].tempId())
2703 break;
2704 }
2705 assert(index != dest_vector.num_parts);
2706
2707 for (int i = instr->operands.size() - 1; i >= 0; i--) {
2708 const Operand& op = instr->operands[i];
2709 if (!op.isTemp() || op.regClass() != instr->definitions[0].regClass())
2710 continue;
2711
2712 src_vectors[i].parts[index] = op;
2713 ctx.vectors[op.tempId()] = src_vectors[i];
2714 }
2715 }
2716
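/* Pre-pass over the whole program collecting placement hints: vector operand
 * groupings, vcc/m0 preferences, and chains of phi-related temporaries that
 * should share a register. Each chain is stored in phi_resources and later
 * turned into 'affinity' links pointing at its representative temporary.
 */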
2717 void
2718 get_affinities(ra_ctx& ctx)
2719 {
2720 std::vector<std::vector<Temp>> phi_resources;
2721 aco::unordered_map<uint32_t, uint32_t> temp_to_phi_resources(ctx.memory);
2722
2723 for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
2724 block_rit++) {
2725 Block& block = *block_rit;
2726
2727 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
2728 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
2729 aco_ptr<Instruction>& instr = *rit;
2730 if (is_phi(instr))
2731 break;
2732
2733 /* add vector affinities */
2734 if (instr->opcode == aco_opcode::p_create_vector) {
2735 for (const Operand& op : instr->operands) {
2736 if (op.isTemp() && op.isFirstKill() &&
2737 op.getTemp().type() == instr->definitions[0].getTemp().type())
2738 ctx.vectors[op.tempId()] = vector_info(instr.get());
2739 }
2740 } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
2741 !instr->mimg().strict_wqm && ctx.program->gfx_level < GFX12) {
2742 for (unsigned i = 3; i < instr->operands.size(); i++)
2743 ctx.vectors[instr->operands[i].tempId()] = vector_info(instr.get(), 3, true);
2744 } else if (instr->opcode == aco_opcode::p_split_vector &&
2745 instr->operands[0].isFirstKillBeforeDef()) {
2746 ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
2747 } else if (instr->isVOPC() && !instr->isVOP3()) {
2748 if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
2749 ctx.assignments[instr->definitions[0].tempId()].vcc = true;
2750 } else if (instr->isVOP2() && !instr->isVOP3()) {
2751 if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
2752 instr->operands[2].regClass().type() == RegType::sgpr)
2753 ctx.assignments[instr->operands[2].tempId()].vcc = true;
2754 if (instr->definitions.size() == 2)
2755 ctx.assignments[instr->definitions[1].tempId()].vcc = true;
2756 } else if (instr->opcode == aco_opcode::s_and_b32 ||
2757 instr->opcode == aco_opcode::s_and_b64) {
2758 /* If SCC is used by a branch, we might be able to use
2759 * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
2760 */
2761 if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
2762 instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
2763 ctx.assignments[instr->operands[0].tempId()].vcc = true;
2764 } else if (instr->opcode == aco_opcode::s_sendmsg) {
2765 ctx.assignments[instr->operands[0].tempId()].m0 = true;
2766 }
2767
2768 int op_fixed_to_def0 = get_op_fixed_to_def(instr.get());
2769 for (unsigned i = 0; i < instr->definitions.size(); i++) {
2770 const Definition& def = instr->definitions[i];
2771 if (!def.isTemp())
2772 continue;
2773 /* mark last-seen phi operand */
2774 auto it = temp_to_phi_resources.find(def.tempId());
2775 if (it != temp_to_phi_resources.end() &&
2776 def.regClass() == phi_resources[it->second][0].regClass()) {
2777 phi_resources[it->second][0] = def.getTemp();
2778 /* try to coalesce phi affinities with parallelcopies */
2779 Operand op;
2780 if (instr->opcode == aco_opcode::p_parallelcopy) {
2781 op = instr->operands[i];
2782 } else if (i == 0 && op_fixed_to_def0 != -1) {
2783 op = instr->operands[op_fixed_to_def0];
2784 } else if (vop3_can_use_vop2acc(ctx, instr.get())) {
2785 op = instr->operands[2];
2786 } else if (i == 0 && sop2_can_use_sopk(ctx, instr.get())) {
2787 op = instr->operands[instr->operands[0].isLiteral()];
2788 } else {
2789 continue;
2790 }
2791
2792 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
2793 phi_resources[it->second].emplace_back(op.getTemp());
2794 temp_to_phi_resources[op.tempId()] = it->second;
2795 }
2796 }
2797 }
2798 }
2799
2800 /* collect phi affinities */
2801 std::map<Operand*, std::vector<vector_info>> vector_phis;
2802 for (; rit != block.instructions.rend(); ++rit) {
2803 aco_ptr<Instruction>& instr = *rit;
2804 assert(is_phi(instr));
2805
2806 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
2807 continue;
2808
2809 assert(instr->definitions[0].isTemp());
2810 auto it = temp_to_phi_resources.find(instr->definitions[0].tempId());
2811 unsigned index = phi_resources.size();
2812 std::vector<Temp>* affinity_related;
2813 if (it != temp_to_phi_resources.end()) {
2814 index = it->second;
2815 phi_resources[index][0] = instr->definitions[0].getTemp();
2816 affinity_related = &phi_resources[index];
2817 } else {
2818 phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
2819 affinity_related = &phi_resources.back();
2820 }
2821
2822 for (const Operand& op : instr->operands) {
2823 if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
2824 affinity_related->emplace_back(op.getTemp());
2825 if (block.kind & block_kind_loop_header)
2826 continue;
2827 temp_to_phi_resources[op.tempId()] = index;
2828 }
2829 }
2830
2831 create_phi_vector_affinities(ctx, instr, vector_phis);
2832 }
2833
2834 /* visit the loop header phis first in order to create nested affinities */
2835 if (block.kind & block_kind_loop_exit) {
2836 /* find loop header */
2837 auto header_rit = block_rit;
2838 while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
2839 header_rit++;
2840
2841 for (aco_ptr<Instruction>& phi : header_rit->instructions) {
2842 if (!is_phi(phi))
2843 break;
2844 if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
2845 continue;
2846
2847 /* create an (empty) merge-set for the phi-related variables */
2848 auto it = temp_to_phi_resources.find(phi->definitions[0].tempId());
2849 unsigned index = phi_resources.size();
2850 if (it == temp_to_phi_resources.end()) {
2851 temp_to_phi_resources[phi->definitions[0].tempId()] = index;
2852 phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
2853 } else {
2854 index = it->second;
2855 }
2856 for (unsigned i = 1; i < phi->operands.size(); i++) {
2857 const Operand& op = phi->operands[i];
2858 if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
2859 temp_to_phi_resources[op.tempId()] = index;
2860 }
2861 }
2862 }
2863 }
2864 }
2865 /* create affinities */
2866 for (std::vector<Temp>& vec : phi_resources) {
2867 for (unsigned i = 1; i < vec.size(); i++)
2868 if (vec[i].id() != vec[0].id())
2869 ctx.assignments[vec[i].id()].affinity = vec[0].id();
2870 }
2871 }
2872
2873 void
2874 optimize_encoding_vop2(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2875 {
2876 if (!vop3_can_use_vop2acc(ctx, instr.get()))
2877 return;
2878
2879 for (unsigned i = ctx.program->gfx_level < GFX11 ? 0 : 2; i < 3; i++) {
2880 if (instr->operands[i].physReg().byte())
2881 return;
2882 }
2883
2884 unsigned def_id = instr->definitions[0].tempId();
2885 if (ctx.assignments[def_id].affinity) {
2886 assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2887 if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
2888 !register_file.test(affinity.reg, instr->operands[2].bytes()))
2889 return;
2890 }
2891
2892 if (!instr->operands[1].isOfType(RegType::vgpr))
2893 instr->valu().swapOperands(0, 1);
2894
2895 if (instr->isVOP3P() && instr->operands[0].isLiteral()) {
2896 unsigned literal = instr->operands[0].constantValue();
2897 unsigned lo = (literal >> (instr->valu().opsel_lo[0] * 16)) & 0xffff;
2898 unsigned hi = (literal >> (instr->valu().opsel_hi[0] * 16)) & 0xffff;
2899 instr->operands[0] = Operand::literal32(lo | (hi << 16));
2900 }
2901
2902 instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) |
2903 (unsigned)Format::VOP2);
2904 instr->valu().opsel_lo = 0;
2905 instr->valu().opsel_hi = 0;
2906 switch (instr->opcode) {
2907 case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
2908 case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
2909 case aco_opcode::v_mad_f16:
2910 case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
2911 case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
2912 case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
2913 case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
2914 case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
2915 case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
2916 default: break;
2917 }
2918 }
2919
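/* Illustrative rewrite: s_add_i32 s0, s1, 0x1234 -> s_addk_i32 s0, 0x1234,
 * which requires the definition to land in the same SGPR as the remaining
 * operand, since SOPK reads and writes sdst. The literal moves into the
 * 16-bit imm field and the literal operand is dropped.
 */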
2920 void
2921 optimize_encoding_sopk(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2922 {
2923 /* try to optimize sop2 with literal source to sopk */
2924 if (!sop2_can_use_sopk(ctx, instr.get()))
2925 return;
2926 unsigned literal_idx = instr->operands[1].isLiteral();
2927
2928 PhysReg op_reg = instr->operands[!literal_idx].physReg();
2929 if (!is_sgpr_writable_without_side_effects(ctx.program->gfx_level, op_reg))
2930 return;
2931
2932 unsigned def_id = instr->definitions[0].tempId();
2933 if (ctx.assignments[def_id].affinity) {
2934 assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2935 if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
2936 !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
2937 return;
2938 }
2939
2940 instr->format = Format::SOPK;
2941 instr->salu().imm = instr->operands[literal_idx].constantValue() & 0xffff;
2942 if (literal_idx == 0)
2943 std::swap(instr->operands[0], instr->operands[1]);
2944 if (instr->operands.size() > 2)
2945 std::swap(instr->operands[1], instr->operands[2]);
2946 instr->operands.pop_back();
2947
2948 switch (instr->opcode) {
2949 case aco_opcode::s_add_u32:
2950 case aco_opcode::s_add_i32: instr->opcode = aco_opcode::s_addk_i32; break;
2951 case aco_opcode::s_mul_i32: instr->opcode = aco_opcode::s_mulk_i32; break;
2952 case aco_opcode::s_cselect_b32: instr->opcode = aco_opcode::s_cmovk_i32; break;
2953 default: unreachable("illegal instruction");
2954 }
2955 }
2956
2957 void
2958 optimize_encoding(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2959 {
2960 if (instr->isVALU())
2961 optimize_encoding_vop2(ctx, register_file, instr);
2962 if (instr->isSALU())
2963 optimize_encoding_sopk(ctx, register_file, instr);
2964 }
2965
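/* Emits the recorded copies as one p_parallelcopy. Overlapping SGPR copies may
 * be lowered to swaps, and linear-VGPR copies also need a scratch register, so
 * if SCC holds a live value in those cases, handle_pseudo() is used to pick a
 * suitable scratch register.
 */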
2966 void
2967 emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
2968 aco_ptr<Instruction>& instr,
2969 std::vector<aco_ptr<Instruction>>& instructions, bool temp_in_scc,
2970 RegisterFile& register_file)
2971 {
2972 if (parallelcopy.empty())
2973 return;
2974
2975 aco_ptr<Instruction> pc;
2976 pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(),
2977 parallelcopy.size()));
2978 bool linear_vgpr = false;
2979 bool may_swap_sgprs = false;
2980 std::bitset<256> sgpr_operands;
2981 for (unsigned i = 0; i < parallelcopy.size(); i++) {
2982 linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
2983
2984 if (!may_swap_sgprs && parallelcopy[i].first.isTemp() &&
2985 parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
2986 unsigned op_reg = parallelcopy[i].first.physReg().reg();
2987 unsigned def_reg = parallelcopy[i].second.physReg().reg();
2988 for (unsigned j = 0; j < parallelcopy[i].first.size(); j++) {
2989 sgpr_operands.set(op_reg + j);
2990 if (sgpr_operands.test(def_reg + j))
2991 may_swap_sgprs = true;
2992 }
2993 }
2994
2995 pc->operands[i] = parallelcopy[i].first;
2996 pc->definitions[i] = parallelcopy[i].second;
2997 assert(pc->operands[i].size() == pc->definitions[i].size());
2998
2999       /* It might happen that the operand is already renamed. We have to restore the
3000 * original name. */
3001 auto it = ctx.orig_names.find(pc->operands[i].tempId());
3002 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
3003 add_rename(ctx, orig, pc->definitions[i].getTemp());
3004 }
3005
3006 if (temp_in_scc && (may_swap_sgprs || linear_vgpr)) {
3007 /* disable definitions and re-enable operands */
3008 RegisterFile tmp_file(register_file);
3009 for (const Definition& def : instr->definitions) {
3010 if (def.isTemp() && !def.isKill())
3011 tmp_file.clear(def);
3012 }
3013 for (const Operand& op : instr->operands) {
3014 if (op.isTemp() && op.isFirstKill())
3015 tmp_file.block(op.physReg(), op.regClass());
3016 }
3017
3018 handle_pseudo(ctx, tmp_file, pc.get());
3019 } else {
3020 pc->pseudo().needs_scratch_reg = may_swap_sgprs || linear_vgpr;
3021 pc->pseudo().scratch_sgpr = scc;
3022 }
3023
3024 instructions.emplace_back(std::move(pc));
3025
3026 parallelcopy.clear();
3027 }
3028
3029 void
3030 emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
3031 aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& instructions,
3032 bool temp_in_scc, RegisterFile& register_file)
3033 {
3034 if (parallelcopy.empty())
3035 return;
3036
3037 std::vector<std::pair<Operand, Definition>> linear_vgpr;
3038 if (ctx.num_linear_vgprs) {
3039 unsigned next = 0;
3040 for (unsigned i = 0; i < parallelcopy.size(); i++) {
3041 if (parallelcopy[i].first.regClass().is_linear_vgpr()) {
3042 linear_vgpr.push_back(parallelcopy[i]);
3043 continue;
3044 }
3045
3046 if (next != i)
3047 parallelcopy[next] = parallelcopy[i];
3048 next++;
3049 }
3050 parallelcopy.resize(next);
3051 }

   /* Because of how linear VGPRs are allocated, we should never have to move a linear VGPR into
    * the space of a normal one. This means the copy can be done entirely before normal VGPR
    * copies. */
   emit_parallel_copy_internal(ctx, linear_vgpr, instr, instructions, temp_in_scc,
                               register_file);
   emit_parallel_copy_internal(ctx, parallelcopy, instr, instructions, temp_in_scc,
                               register_file);
}

} /* end namespace */

void
register_allocation(Program* program, ra_test_policy policy)
{
   ra_ctx ctx(program, policy);
   get_affinities(ctx);

   for (Block& block : program->blocks) {
      ctx.block = &block;

      /* initialize register file */
      RegisterFile register_file = init_reg_file(ctx, program->live.live_in, block);
      ctx.war_hint.reset();
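      /* Reset the round-robin allocation cursors to the start of each register
       * file (VGPRs start at encoded register 256, i.e. v0). */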
      ctx.rr_vgpr_it = {PhysReg{256}};
      ctx.rr_sgpr_it = {PhysReg{0}};

      std::vector<aco_ptr<Instruction>> instructions;
      instructions.reserve(block.instructions.size());

      /* This is a slight adjustment from the paper, as we already have phi nodes:
       * we consider them incomplete phis and only handle the definition. */
      get_regs_for_phis(ctx, block, register_file, instructions,
                        program->live.live_in[block.index]);

      /* Handle all other instructions of the block */
      auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
      auto instr_it = std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
      for (; instr_it != block.instructions.end(); ++instr_it) {
         aco_ptr<Instruction>& instr = *instr_it;
         std::vector<std::pair<Operand, Definition>> parallelcopy;
         assert(!is_phi(instr));

         /* handle operands */
         bool fixed = false;
         for (unsigned i = 0; i < instr->operands.size(); ++i) {
            auto& operand = instr->operands[i];
            if (!operand.isTemp())
               continue;

            /* rename operands */
            operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
            assert(ctx.assignments[operand.tempId()].assigned);

            fixed |=
               operand.isPrecolored() && ctx.assignments[operand.tempId()].reg != operand.physReg();
         }

         bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
                             instr->opcode == aco_opcode::v_writelane_b32_e64;
         if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
             instr->operands[1].isTemp()) {
            /* v_writelane_b32 can take two sgprs but only if one is m0. */
            if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
                ctx.assignments[instr->operands[1].tempId()].reg != m0) {
               instr->operands[0].setPrecolored(m0);
               fixed = true;
            }
         }
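         /* If any operand is precolored to a register other than its current
          * assignment, insert the parallelcopies needed to satisfy those
          * constraints. */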

         if (fixed)
            handle_fixed_operands(ctx, register_file, parallelcopy, instr);

         for (unsigned i = 0; i < instr->operands.size(); ++i) {
            auto& operand = instr->operands[i];
            if (!operand.isTemp() || operand.isFixed())
               continue;

            PhysReg reg = ctx.assignments[operand.tempId()].reg;
            if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
               operand.setFixed(reg);
            else
               get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);

            if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
                (instr->isDS() && instr->ds().gds)) {
               for (unsigned j = 0; j < operand.size(); j++)
                  ctx.war_hint.set(operand.physReg().reg() + j);
            }
         }
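         /* Remember whether SCC holds a live value at this point: copies
          * emitted for this instruction must not clobber it in that case. */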
         bool temp_in_scc = register_file[scc];

         /* remove dead vars from register file */
         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isFirstKillBeforeDef())
               register_file.clear(op);
         }
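
         /* Now that operand registers are known, try to switch the instruction
          * to a cheaper encoding where the chosen registers allow it. */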
         optimize_encoding(ctx, register_file, instr);

         /* Handle definitions which must have the same register as an operand.
          * We expect that the definition has the same size as the operand, otherwise the new
          * location for the operand (if it's not killed) might intersect with the old one.
          * We can't read from the old location because it's corrupted, and we can't write the new
          * location because that's used by a live-through operand.
          */
         int op_fixed_to_def = get_op_fixed_to_def(instr.get());
         if (op_fixed_to_def != -1)
            instr->definitions[0].setPrecolored(instr->operands[op_fixed_to_def].physReg());

         /* handle fixed definitions first */
         for (unsigned i = 0; i < instr->definitions.size(); ++i) {
            auto& definition = instr->definitions[i];
            if (!definition.isFixed())
               continue;

            adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
            /* check if the target register is blocked */
            if (register_file.test(definition.physReg(), definition.bytes())) {
               const PhysRegInterval def_regs{definition.physReg(), definition.size()};

               /* create parallelcopy pair to move blocking vars */
               std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);

               RegisterFile tmp_file(register_file);
               /* re-enable the killed operands, so that we don't move the blocking vars there */
               tmp_file.fill_killed_operands(instr.get());

               ASSERTED bool success = false;
               success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
               assert(success);

               update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
            }

            if (!definition.isTemp())
               continue;

            ctx.assignments[definition.tempId()].set(definition);
            register_file.fill(definition);
         }

         /* handle all other definitions */
         for (unsigned i = 0; i < instr->definitions.size(); ++i) {
            Definition* definition = &instr->definitions[i];

            if (definition->isFixed() || !definition->isTemp())
               continue;

            /* find free reg */
            if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
               /* Allocation of linear VGPRs is special. */
               definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy));
               update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
            } else if (instr->opcode == aco_opcode::p_split_vector) {
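               /* Try to place each definition at its byte offset within the
                * source vector, so that the split needs no actual copies. */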
               PhysReg reg = instr->operands[0].physReg();
               RegClass rc = definition->regClass();
               for (unsigned j = 0; j < i; j++)
                  reg.reg_b += instr->definitions[j].bytes();
               if (get_reg_specified(ctx, register_file, rc, instr, reg, -1)) {
                  definition->setFixed(reg);
               } else if (i == 0) {
                  RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
                  DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
                  std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info);
                  if (res && get_reg_specified(ctx, register_file, rc, instr, *res, -1))
                     definition->setFixed(*res);
               } else if (instr->definitions[i - 1].isFixed()) {
                  reg = instr->definitions[i - 1].physReg();
                  reg.reg_b += instr->definitions[i - 1].bytes();
                  if (get_reg_specified(ctx, register_file, rc, instr, reg, -1))
                     definition->setFixed(reg);
               }
            } else if (instr->opcode == aco_opcode::p_parallelcopy) {
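               /* For copies, prefer keeping the value where it already is, if
                * the register is free and the register types match. */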
               PhysReg reg = instr->operands[i].physReg();
               if (instr->operands[i].isTemp() &&
                   instr->operands[i].getTemp().type() == definition->getTemp().type() &&
                   !register_file.test(reg, definition->bytes()))
                  definition->setFixed(reg);
            } else if (instr->opcode == aco_opcode::p_extract_vector) {
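               /* Try the register that the extracted element already occupies
                * inside the source vector, which would make the extract a
                * no-op. */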
               PhysReg reg = instr->operands[0].physReg();
               reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
               if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg, -1))
                  definition->setFixed(reg);
            } else if (instr->opcode == aco_opcode::p_create_vector) {
               PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
                                                   parallelcopy, instr);
               update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
               definition->setFixed(reg);
            } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma &&
                       instr->operands[2].isTemp() && instr->operands[2].isKill() &&
                       instr->operands[2].regClass() == definition->regClass()) {
               /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it.
                * Here we set a policy of forcing them the same if operands[2] gets killed (and
                * otherwise they don't overlap). This may not be optimal if RA would select a
                * different location due to affinity, but that gets complicated very quickly. */
               definition->setFixed(instr->operands[2].physReg());
            }

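            /* If none of the heuristics above applied, fall back to the
             * generic register search. */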
            if (!definition->isFixed()) {
               Temp tmp = definition->getTemp();
               if (definition->regClass().is_subdword() && definition->bytes() < 4) {
                  PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
                  definition->setFixed(reg);
                  if (reg.byte() || register_file.test(reg, 4)) {
                     bool allow_16bit_write = reg.byte() % 2 == 0 && !register_file.test(reg, 2);
                     add_subdword_definition(program, instr, reg, allow_16bit_write);
                     /* add_subdword_definition can invalidate the reference */
                     definition = &instr->definitions[i];
                  }
               } else {
                  definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
               }
               update_renames(ctx, register_file, parallelcopy, instr,
                              instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
                                                                           : (UpdateRenames)0);
            }

            assert(
               definition->isFixed() &&
               ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
                (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
            ctx.assignments[definition->tempId()].set(*definition);
            register_file.fill(*definition);
         }

         handle_pseudo(ctx, register_file, instr.get());

         /* kill definitions and late-kill operands, and ensure that sub-dword operands can
          * actually be read */
         for (const Definition& def : instr->definitions) {
            if (def.isTemp() && def.isKill())
               register_file.clear(def);
         }
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            const Operand& op = instr->operands[i];
            if (op.isTemp() && op.isFirstKill() && op.isLateKill())
               register_file.clear(op);
            if (op.isTemp() && op.physReg().byte() != 0)
               add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
         }

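         /* Materialize all copies gathered for this instruction as
          * parallelcopies placed right before it. */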
         emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file);

         /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
         bool instr_needs_vop3 =
            !instr->isVOP3() &&
            ((withoutDPP(instr->format) == Format::VOPC &&
              instr->definitions[0].physReg() != vcc) ||
             (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) ||
             ((instr->opcode == aco_opcode::v_add_co_u32 ||
               instr->opcode == aco_opcode::v_addc_co_u32 ||
               instr->opcode == aco_opcode::v_sub_co_u32 ||
               instr->opcode == aco_opcode::v_subb_co_u32 ||
               instr->opcode == aco_opcode::v_subrev_co_u32 ||
               instr->opcode == aco_opcode::v_subbrev_co_u32) &&
              instr->definitions[1].physReg() != vcc) ||
             ((instr->opcode == aco_opcode::v_addc_co_u32 ||
               instr->opcode == aco_opcode::v_subb_co_u32 ||
               instr->opcode == aco_opcode::v_subbrev_co_u32) &&
              instr->operands[2].physReg() != vcc));
         if (instr_needs_vop3) {
            /* If the first operand is a literal, we have to move it to an sgpr
             * for generations without VOP3+literal support.
             * Both literals and sgprs count towards the constant bus limit,
             * so this is always valid.
             */
            if (instr->operands.size() && instr->operands[0].isLiteral() &&
                program->gfx_level < GFX10) {
               /* Re-use the register we already allocated for the definition.
                * This works because the instruction cannot have any other SGPR operand.
                */
               Temp tmp = program->allocateTmp(instr->operands[0].size() == 2 ? s2 : s1);
               const Definition& def =
                  instr->isVOPC() ? instr->definitions[0] : instr->definitions.back();
               assert(def.regClass() == s2);
               ctx.assignments.emplace_back(def.physReg(), tmp.regClass());

               Instruction* copy =
                  create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1);
               copy->operands[0] = instr->operands[0];
               if (copy->operands[0].bytes() < 4)
                  copy->operands[0] = Operand::c32(copy->operands[0].constantValue());
               copy->definitions[0] = Definition(tmp);
               copy->definitions[0].setFixed(def.physReg());

               instr->operands[0] = Operand(tmp);
               instr->operands[0].setFixed(def.physReg());
               instr->operands[0].setFirstKill(true);

               instructions.emplace_back(copy);
            }

            /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
            instr->format = asVOP3(instr->format);
         }

         instructions.emplace_back(std::move(*instr_it));

      } /* end for Instr */

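      /* At the end of a top-level block without successors, nothing should be
       * live anymore: check that the register file is completely free again. */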
      if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) {
         /* Reset this for block_kind_resume. */
         ctx.num_linear_vgprs = 0;

         ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
         ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false);
         assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds);
         assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds);
      } else if (should_compact_linear_vgprs(ctx, register_file)) {
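         /* Temporarily remove the branch so the compaction copies are
          * inserted before it, then re-append it. */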
         aco_ptr<Instruction> br = std::move(instructions.back());
         instructions.pop_back();

         bool temp_in_scc =
            register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);

         std::vector<std::pair<Operand, Definition>> parallelcopy;
         compact_linear_vgprs(ctx, register_file, parallelcopy);
         update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
         emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc,
                                     register_file);

         instructions.push_back(std::move(br));
      }

      block.instructions = std::move(instructions);
   } /* end for BB */

   /* num_gpr = rnd_up(max_used_gpr + 1) */
   program->config->num_vgprs =
      std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
   program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);

   program->progress = CompilationProgress::after_ra;
}

} // namespace aco