/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include "util/bitset.h"
#include "util/enum_operators.h"

#include <algorithm>
#include <array>
#include <bitset>
#include <map>
#include <optional>
#include <set>
#include <unordered_map>
#include <vector>

namespace aco {
namespace {

struct ra_ctx;

unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                                     unsigned idx, RegClass rc);
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                          RegClass rc);
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc);
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg);

struct assignment {
   PhysReg reg;
   RegClass rc;
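   /* The flag bits below are overlaid with a single byte through the anonymous
    * union, so "_ = 0" zero-initializes all flags at once. */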
   union {
      struct {
         bool assigned : 1;
         bool vcc : 1;
         bool m0 : 1;
      };
      uint8_t _ = 0;
   };
   uint32_t affinity = 0;
   assignment() = default;
   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }
   void set(const Definition& def)
   {
      assigned = true;
      reg = def.physReg();
      rc = def.regClass();
   }
};

struct ra_ctx {

   Program* program;
   Block* block = NULL;
   std::vector<assignment> assignments;
   std::vector<std::unordered_map<unsigned, Temp>> renames;
   std::vector<uint32_t> loop_header;
   std::unordered_map<unsigned, Temp> orig_names;
   std::unordered_map<unsigned, Instruction*> vectors;
   std::unordered_map<unsigned, Instruction*> split_vectors;
   aco_ptr<Instruction> pseudo_dummy;
   aco_ptr<Instruction> phi_dummy;
   uint16_t max_used_sgpr = 0;
   uint16_t max_used_vgpr = 0;
   uint16_t sgpr_limit;
   uint16_t vgpr_limit;
   std::bitset<512> war_hint;

   ra_test_policy policy;

   ra_ctx(Program* program_, ra_test_policy policy_)
       : program(program_), assignments(program->peekAllocationId()),
         renames(program->blocks.size()), policy(policy_)
   {
      pseudo_dummy.reset(
         create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
      phi_dummy.reset(
         create_instruction<Instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
      sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
      vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
   }
};

/* Iterator type for making PhysRegInterval compatible with range-based for */
struct PhysRegIterator {
   using difference_type = int;
   using value_type = unsigned;
   using reference = const unsigned&;
   using pointer = const unsigned*;
   using iterator_category = std::bidirectional_iterator_tag;

   PhysReg reg;

   PhysReg operator*() const { return reg; }

   PhysRegIterator& operator++()
   {
      reg.reg_b += 4;
      return *this;
   }

   PhysRegIterator& operator--()
   {
      reg.reg_b -= 4;
      return *this;
   }

   bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }

   bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }

   bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
};

/* Half-open register interval used in "sliding window"-style for-loops */
struct PhysRegInterval {
   PhysReg lo_;
   unsigned size;

   /* Inclusive lower bound */
   PhysReg lo() const { return lo_; }

   /* Exclusive upper bound */
   PhysReg hi() const { return PhysReg{lo() + size}; }

   PhysRegInterval& operator+=(uint32_t stride)
   {
      lo_ = PhysReg{lo_.reg() + stride};
      return *this;
   }

   bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }

   /* Construct a half-open interval, excluding the end register */
   static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }

   bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }

   bool contains(const PhysRegInterval& needle) const
   {
      return needle.lo() >= lo() && needle.hi() <= hi();
   }

   PhysRegIterator begin() const { return {lo_}; }

   PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
};
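
/* Usage sketch (illustrative comment added here, not from upstream): iterating
 * a PhysRegInterval visits one dword register per step, e.g.
 *
 *    for (PhysReg r : PhysRegInterval{PhysReg{256}, 4})
 *       ; // visits v0..v3, since VGPRs start at encoding 256
 *
 * and from_until() builds the same half-open shape from two endpoints:
 * PhysRegInterval::from_until(PhysReg{0}, PhysReg{4}) covers s0..s3.
 */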

bool
intersects(const PhysRegInterval& a, const PhysRegInterval& b)
{
   return a.hi() > b.lo() && b.hi() > a.lo();
}

/* Gets the stride for full (non-subdword) registers */
uint32_t
get_stride(RegClass rc)
{
   if (rc.type() == RegType::vgpr) {
      return 1;
   } else {
      uint32_t size = rc.size();
      if (size == 2) {
         return 2;
      } else if (size >= 4) {
         return 4;
      } else {
         return 1;
      }
   }
}
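
/* Illustrative examples of the rules above (comment added here, derived from
 * the code): any VGPR class gets stride 1, so v1/v2/v3 can start at any
 * register. s2 (e.g. 64-bit SGPR values) must start at an even register and
 * s4 or larger at a multiple of four, while s3 falls through to stride 1. */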

PhysRegInterval
get_reg_bounds(Program* program, RegType type)
{
   if (type == RegType::vgpr) {
      return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};
   } else {
      return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};
   }
}

struct DefInfo {
   PhysRegInterval bounds;
   uint8_t size;
   uint8_t stride;
   RegClass rc;

   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);

      bounds = get_reg_bounds(ctx.program, rc.type());

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
      } else if (rc.is_subdword()) {
         std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
         stride = info.first;
         if (info.second > rc.bytes()) {
            rc = RegClass::get(rc.type(), info.second);
            size = rc.size();
            /* we might still be able to put the definition in the high half,
             * but that's only useful for affinities and this information isn't
             * used for them */
            stride = align(stride, info.second);
            if (!rc.is_subdword())
               stride = DIV_ROUND_UP(stride, 4);
         }
         assert(stride > 0);
      } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
         /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
          *
          * The register use is not calculated correctly, and the hardware assumes a
          * full dword per component. Don't use the last registers of the register file.
          * Otherwise, the instruction will be skipped.
          *
          * https://reviews.llvm.org/D81172
          */
         bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
         assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");

         if (imageGather4D16Bug)
            bounds.size -= rc.bytes() / 4;
      }
   }
};

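/* Each dword entry of RegisterFile::regs holds the temp-ID occupying that
 * register, 0 if it is free, 0xFFFFFFFF if it is blocked, or the marker
 * 0xF0000000 if the dword is subdivided, in which case subdword_regs maps the
 * register to four per-byte entries with the same meaning. (Summary comment
 * added here; derived from the methods below.) */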
class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

   std::array<uint32_t, 512> regs;
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   unsigned count_zero(PhysRegInterval reg_interval)
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes)
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            assert(subdword_regs.find(i) != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   bool is_blocked(PhysReg start)
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start)
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         return subdword_regs[start][start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   void fill(Operand op)
   {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   unsigned get_id(PhysReg reg)
   {
      return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
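
/* A minimal sketch of how the allocator drives RegisterFile (illustrative
 * only; "file", "def" and "op" are hypothetical locals, not upstream code):
 *
 *    RegisterFile file;
 *    file.fill(def);                          // def now occupies its registers
 *    bool busy = file.test(def.physReg(), 4); // -> true, those bytes are taken
 *    file.clear(op);                          // op's registers become free again
 */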

std::vector<unsigned> find_vars(ra_ctx& ctx, RegisterFile& reg_file,
                                const PhysRegInterval reg_interval);

/* helper function for debugging */
UNUSED void
print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
{
   if (reg_file[reg] == 0xFFFFFFFF) {
      printf((const char*)u8"☐");
   } else if (reg_file[reg]) {
      const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
      if (show_subdword_alloc) {
         auto block_chars = {
            // clang-format off
            u8"?", u8"▘", u8"▝", u8"▀",
            u8"▖", u8"▌", u8"▞", u8"▛",
            u8"▗", u8"▚", u8"▐", u8"▜",
            u8"▄", u8"▙", u8"▟", u8"▉"
            // clang-format on
         };
         unsigned index = 0;
         for (int i = 0; i < 4; ++i) {
            if (reg_file.subdword_regs.at(reg)[i]) {
               index |= 1 << i;
            }
         }
         printf("%s", (const char*)(block_chars.begin()[index]));
      } else {
         /* Indicate filled register slot */
         if (!has_adjacent_variable) {
            printf((const char*)u8"█");
         } else {
            /* Use a slightly shorter box to leave a small gap between adjacent variables */
            printf((const char*)u8"▉");
         }
      }
   } else {
      printf((const char*)u8"·");
   }
}

/* helper function for debugging */
UNUSED void
print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);
   char reg_char = vgprs ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf(" ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         printf(" %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */
   for (unsigned id : find_vars(ctx, reg_file, regs)) {
      const assignment& var = ctx.assignments[id];
      PhysReg reg = var.reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() - regs.lo());
      }
      printf("]");
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}

unsigned
get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                            unsigned idx, RegClass rc)
{
   if (instr->isPseudo()) {
      /* v_readfirstlane_b32 cannot use SDWA */
      if (instr->opcode == aco_opcode::p_as_uniform)
         return 4;
      else if (gfx_level >= GFX8)
         return rc.bytes() % 2 == 0 ? 2 : 1;
      else
         return 4;
   }

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (can_use_SDWA(gfx_level, instr, false))
         return rc.bytes();
      if (can_use_opsel(gfx_level, instr->opcode, idx))
         return 2;
      if (instr->isVOP3P())
         return 2;
   }

   switch (instr->opcode) {
   case aco_opcode::v_cvt_f32_ubyte0: return 1;
   case aco_opcode::ds_write_b8:
   case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
   case aco_opcode::buffer_store_byte:
   case aco_opcode::buffer_store_short:
   case aco_opcode::buffer_store_format_d16_x:
   case aco_opcode::flat_store_byte:
   case aco_opcode::flat_store_short:
   case aco_opcode::scratch_store_byte:
   case aco_opcode::scratch_store_short:
   case aco_opcode::global_store_byte:
   case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
   default: return 4;
   }
}

void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                     RegClass rc)
{
   amd_gfx_level gfx_level = ctx.program->gfx_level;
   if (instr->isPseudo() || byte == 0)
      return;

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
         switch (byte) {
         case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
         case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
         case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
         case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
         }
         return;
      }

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      /* use opsel */
      if (instr->isVOP3P()) {
         assert(byte == 2 && !instr->valu().opsel_lo[idx]);
         instr->valu().opsel_lo[idx] = true;
         instr->valu().opsel_hi[idx] = true;
         return;
      }

      assert(can_use_opsel(gfx_level, instr->opcode, idx));
      instr->valu().opsel[idx] = true;
      return;
   }

   assert(byte == 2);
   if (instr->opcode == aco_opcode::ds_write_b8)
      instr->opcode = aco_opcode::ds_write_b8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_write_b16)
      instr->opcode = aco_opcode::ds_write_b16_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_byte)
      instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_short)
      instr->opcode = aco_opcode::buffer_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
      instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_store_byte)
      instr->opcode = aco_opcode::flat_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_store_short)
      instr->opcode = aco_opcode::flat_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_byte)
      instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_short)
      instr->opcode = aco_opcode::scratch_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_byte)
      instr->opcode = aco_opcode::global_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_short)
      instr->opcode = aco_opcode::global_store_short_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
   return;
}

/* minimum_stride, bytes_written */
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)
{
   amd_gfx_level gfx_level = program->gfx_level;

   if (instr->isPseudo()) {
      if (instr->opcode == aco_opcode::p_interp_gfx11)
         return std::make_pair(4u, 4u);
      else if (gfx_level >= GFX8)
         return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
      else
         return std::make_pair(4, rc.size() * 4u);
   }

   if (instr->isVALU() || instr->isVINTRP()) {
      assert(rc.bytes() <= 2);

      if (can_use_SDWA(gfx_level, instr, false))
         return std::make_pair(rc.bytes(), rc.bytes());

      unsigned bytes_written = 4u;
      if (instr_is_16bit(gfx_level, instr->opcode))
         bytes_written = 2u;

      unsigned stride = 4u;
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
          can_use_opsel(gfx_level, instr->opcode, -1))
         stride = 2u;

      return std::make_pair(stride, bytes_written);
   }

   switch (instr->opcode) {
   /* D16 loads with _hi version */
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_i8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_sbyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_sbyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_sbyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16:
   case aco_opcode::buffer_load_format_d16_x: {
      assert(gfx_level >= GFX9);
      if (!program->dev.sram_ecc_enabled)
         return std::make_pair(2u, 2u);
      else
         return std::make_pair(2u, 4u);
   }
   /* 3-component D16 loads */
   case aco_opcode::buffer_load_format_d16_xyz:
   case aco_opcode::tbuffer_load_format_d16_xyz: {
      assert(gfx_level >= GFX9);
      if (!program->dev.sram_ecc_enabled)
         return std::make_pair(4u, 6u);
      break;
   }

   default: break;
   }

   if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
      assert(gfx_level >= GFX9);
      return std::make_pair(4u, rc.bytes());
   }

   return std::make_pair(4, rc.size() * 4u);
}

void
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)
{
   if (instr->isPseudo())
      return;

   if (instr->isVALU()) {
      amd_gfx_level gfx_level = program->gfx_level;
      assert(instr->definitions[0].bytes() <= 2);

      if (reg.byte() == 0 && instr_is_16bit(gfx_level, instr->opcode))
         return;

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
         instr->opcode = aco_opcode::v_fma_mixhi_f16;
         return;
      }

      /* use opsel */
      assert(reg.byte() == 2);
      assert(can_use_opsel(gfx_level, instr->opcode, -1));
      instr->valu().opsel[3] = true; /* dst in high half */
      return;
   }

   if (reg.byte() == 0)
      return;
   else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
      instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
      instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_short_d16)
      instr->opcode = aco_opcode::buffer_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
      instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
      instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
      instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_short_d16)
      instr->opcode = aco_opcode::flat_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
      instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
      instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_short_d16)
      instr->opcode = aco_opcode::scratch_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
      instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
      instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_short_d16)
      instr->opcode = aco_opcode::global_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u8_d16)
      instr->opcode = aco_opcode::ds_read_u8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_i8_d16)
      instr->opcode = aco_opcode::ds_read_i8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u16_d16)
      instr->opcode = aco_opcode::ds_read_u16_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
}

void
adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
{
   uint16_t max_addressible_sgpr = ctx.sgpr_limit;
   unsigned size = rc.size();
   if (rc.type() == RegType::vgpr) {
      assert(reg >= 256);
      uint16_t hi = reg - 256 + size - 1;
      assert(hi <= 255);
      ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
   } else if (reg + rc.size() <= max_addressible_sgpr) {
      uint16_t hi = reg + size - 1;
      ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
   }
}

enum UpdateRenames {
   rename_not_killed_ops = 0x1,
   fill_killed_ops = 0x2,
   rename_precolored_ops = 0x4,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);
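
/* How update_renames() interprets these flags (summary comment added here,
 * derived from the code below): rename_not_killed_ops also renames operands
 * that survive the instruction, fill_killed_ops re-occupies the register file
 * even for killed operands, and rename_precolored_ops renames fixed operands
 * only when the copy destination matches their precolored register. */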

void
update_renames(ra_ctx& ctx, RegisterFile& reg_file,
               std::vector<std::pair<Operand, Definition>>& parallelcopies,
               aco_ptr<Instruction>& instr, UpdateRenames flags)
{
   /* clear operands */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;
      reg_file.clear(copy.first);
   }

   /* allocate IDs and rename operands: this is done transparently here */
   auto it = parallelcopies.begin();
   while (it != parallelcopies.end()) {
      if (it->second.isTemp()) {
         ++it;
         continue;
      }

      /* check if we moved a definition: change the register and remove copy */
      bool is_def = false;
      for (Definition& def : instr->definitions) {
         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
            // FIXME: ensure that the definition can use this reg
            def.setFixed(it->second.physReg());
            reg_file.fill(def);
            ctx.assignments[def.tempId()].reg = def.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            break;
         }
      }
      if (is_def)
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (it->first.getTemp() == other.second.getTemp()) {
            other.second.setFixed(it->second.physReg());
            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            /* check if we moved an operand, again */
            bool fill = true;
            for (Operand& op : instr->operands) {
               if (op.isTemp() && op.tempId() == other.second.tempId()) {
                  // FIXME: ensure that the operand can use this reg
                  op.setFixed(other.second.physReg());
                  fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
               }
            }
            if (fill)
               reg_file.fill(other.second);
            break;
         }
      }
      if (is_def)
         continue;

      std::pair<Operand, Definition>& copy = *it;
      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());

      /* check if we moved an operand */
      bool first = true;
      bool fill = true;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* only rename precolored operands if the copy-location matches */
            if ((flags & rename_precolored_ops) && op.isFixed() &&
                op.physReg() != copy.second.physReg())
               continue;

            bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               omit_renaming &= def_reg > copy.first.physReg()
                                   ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
                                   : (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming) {
               if (first)
                  op.setFirstKill(true);
               else
                  op.setKill(true);
               first = false;
               continue;
            }
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());

            fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
         }
      }

      if (fill)
         reg_file.fill(copy.second);

      ++it;
   }
}

std::optional<PhysReg>
get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
   RegClass rc = info.rc;

   DefInfo new_info = info;
   new_info.rc = RegClass(rc.type(), size);
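   /* Try larger power-of-two strides first (up to 16 registers): if the size
    * allows a bigger alignment, preferring it leaves aligned gaps for later,
    * larger variables. (Explanatory comment added here; derived from the
    * recursion below.) */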
   for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) {
      if (size % new_stride)
         continue;
      new_info.stride = new_stride;
      std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info);
      if (res)
         return res;
   }

   auto is_free = [&](PhysReg reg_index)
   { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };

   if (stride == 1) {
      /* best fit algorithm: find the smallest gap that fits the variable */
      PhysRegInterval best_gap{PhysReg{0}, UINT_MAX};
      const unsigned max_gpr =
         (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr;

      PhysRegIterator reg_it = bounds.begin();
      const PhysRegIterator end_it =
         std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it));
      while (reg_it != bounds.end()) {
         /* Find the next chunk of available register slots */
         reg_it = std::find_if(reg_it, end_it, is_free);
         auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free);
         if (reg_it == bounds.end()) {
            break;
         }

         if (next_nonfree_it == end_it) {
            /* All registers past max_used_gpr are free */
            next_nonfree_it = bounds.end();
         }

         PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it);

         /* early return on exact matches */
         if (size == gap.size) {
            adjust_max_used_regs(ctx, rc, gap.lo());
            return gap.lo();
         }

         /* check if it fits and the gap size is smaller */
         if (size < gap.size && gap.size < best_gap.size) {
            best_gap = gap;
         }

         /* Move past the processed chunk */
         reg_it = next_nonfree_it;
      }

      if (best_gap.size == UINT_MAX)
         return {};

      /* find best position within gap by leaving a good stride for other variables */
      unsigned buffer = best_gap.size - size;
      if (buffer > 1) {
         if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) ||
             ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) ||
             ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0))
            best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer};
      }

      adjust_max_used_regs(ctx, rc, best_gap.lo());
      return best_gap.lo();
   }

   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      if (reg_file[reg_win.lo()] != 0) {
         continue;
      }

      bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);
      if (is_valid) {
         adjust_max_used_regs(ctx, rc, reg_win.lo());
         return reg_win.lo();
      }
   }

   /* do this late because using the upper bytes of a register can require
    * larger instruction encodings or copies
    * TODO: don't do this in situations where it doesn't benefit */
   if (rc.is_subdword()) {
      for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {
         assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
         if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
            continue;

         for (unsigned i = 0; i < 4; i += info.stride) {
            /* check if there's a block of free bytes large enough to hold the register */
            bool reg_found =
               std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())],
                           [](unsigned v) { return v == 0; });

            /* check if also the neighboring reg is free if needed */
            if (reg_found && i + rc.bytes() > 4)
               reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);

            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               adjust_max_used_regs(ctx, rc, entry.first);
               return res;
            }
         }
      }
   }

   return {};
}

/* collect variables from a register area */
std::vector<unsigned>
find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> vars;
   for (PhysReg j : reg_interval) {
      if (reg_file.is_blocked(j))
         continue;
      if (reg_file[j] == 0xF0000000) {
         for (unsigned k = 0; k < 4; k++) {
            unsigned id = reg_file.subdword_regs[j][k];
            if (id && (vars.empty() || id != vars.back()))
               vars.emplace_back(id);
         }
      } else {
         unsigned id = reg_file[j];
         if (id && (vars.empty() || id != vars.back()))
            vars.emplace_back(id);
      }
   }
   return vars;
}

/* collect variables from a register area and clear reg_file
 * variables are sorted in decreasing size and
 * increasing assigned register
 */
std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
   std::sort(ids.begin(), ids.end(),
             [&](unsigned a, unsigned b)
             {
                assignment& var_a = ctx.assignments[a];
                assignment& var_b = ctx.assignments[b];
                return var_a.rc.bytes() > var_b.rc.bytes() ||
                       (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
             });

   for (unsigned id : ids) {
      assignment& var = ctx.assignments[id];
      reg_file.clear(var.reg, var.rc);
   }
   return ids;
}

std::optional<PhysReg>
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
                               aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
                               DefInfo info, unsigned id)
{
   PhysReg reg = def_reg.lo();
   /* dead operand: return position in vector */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
          instr->operands[i].isKillBeforeDef()) {
         assert(!reg_file.test(reg, instr->operands[i].bytes()));
         if (info.rc.is_subdword() || reg.byte() == 0)
            return reg;
         else
            return {};
      }
      reg.reg_b += instr->operands[i].bytes();
   }

   /* GFX9+ has a VGPR swap instruction. */
   if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr)
      return {};

   /* check if the previous position was inside the vector */
   assignment& var = ctx.assignments[id];
   if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
      reg = def_reg.lo();
      /* try to use the previous register of the operand */
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         if (reg != var.reg) {
            reg.reg_b += instr->operands[i].bytes();
            continue;
         }

         /* check if we can swap positions */
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
             instr->operands[i].regClass() == info.rc) {
            assignment& op = ctx.assignments[instr->operands[i].tempId()];
            /* if everything matches, create parallelcopy for the killed operand */
            if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc &&
                reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
               Definition pc_def = Definition(reg, info.rc);
               parallelcopies.emplace_back(instr->operands[i], pc_def);
               return op.reg;
            }
         }
         return {};
      }
   }
   return {};
}

bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
                    const PhysRegInterval def_reg)
{
   /* Variables are sorted from large to small and with increasing assigned register */
   for (unsigned id : vars) {
      assignment& var = ctx.assignments[id];
      PhysRegInterval bounds = get_reg_bounds(ctx.program, var.rc.type());
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
      uint32_t size = info.size;

      /* If this is a dead operand, we can re-use the space from the definition;
       * also use the correct stride for sub-dword operands */
      bool is_dead_operand = false;
      std::optional<PhysReg> res;
      if (instr->opcode == aco_opcode::p_create_vector) {
         res =
            get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
      } else {
         for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
            if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
               info = DefInfo(ctx, instr, var.rc, i);
               if (instr->operands[i].isKillBeforeDef()) {
                  info.bounds = def_reg;
                  res = get_reg_simple(ctx, reg_file, info);
                  is_dead_operand = true;
               }
               break;
            }
         }
      }
      if (!res && !def_reg.size) {
         /* If this is before definitions are handled, def_reg may be an empty interval. */
         info.bounds = bounds;
         res = get_reg_simple(ctx, reg_file, info);
      } else if (!res) {
         /* Try to find space within the bounds but outside of the definition */
         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
         res = get_reg_simple(ctx, reg_file, info);
         if (!res && def_reg.hi() <= bounds.hi()) {
            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res) {
         /* mark the area as blocked */
         reg_file.block(*res, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(*res, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      PhysReg best_pos = bounds.lo();
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
           reg_win += stride) {
         if (!is_dead_operand && intersects(reg_win, def_reg))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (PhysReg j : reg_win) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(j) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs inside control flow */
            if (!(ctx.block->kind & block_kind_top_level) &&
                ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_win.lo();
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      PhysRegInterval reg_win{best_pos, size};

      /* collect variables and block reg file */
      std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);

      /* mark the area as blocked */
      reg_file.block(reg_win.lo(), var.rc);
      adjust_max_used_regs(ctx, var.rc, reg_win.lo());

      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg))
         return false;

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}

std::optional<PhysReg>
get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
             std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
             aco_ptr<Instruction>& instr)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(bounds);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   std::bitset<256> is_killed_operand; /* per-register */
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      Operand& op = instr->operands[j];
      if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
          !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
         assert(op.isFixed());

         for (unsigned i = 0; i < op.size(); ++i) {
            is_killed_operand[(op.physReg() & 0xff) + i] = true;
         }

         killed_ops += op.getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   PhysRegInterval best_win = {bounds.lo(), size};
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      /* first check if the register window starts in the middle of an
       * allocated variable: this is what we have to fix to allow for
       * num_moves > size */
      if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      for (const PhysReg j : reg_win) {
         /* dead operands effectively reduce the number of estimated moves */
         if (is_killed_operand[j & 0xFF]) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs inside control flow */
         // TODO: ensure that live range splits inside control flow are never necessary
         if (!(ctx.block->kind & block_kind_top_level) &&
             ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_win = reg_win;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF)
      return {};

   /* now, we figured the placement for our definition */
   RegisterFile tmp_file(reg_file);

   /* p_create_vector: also re-place killed operands in the definition space */
   if (instr->opcode == aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);

   /* re-enable killed operands */
   if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
      return {};

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   adjust_max_used_regs(ctx, rc, best_win.lo());
   return best_win.lo();
}

bool
get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
                  PhysReg reg)
{
   /* catch out-of-range registers */
   if (reg >= PhysReg{512})
      return false;

   std::pair<unsigned, unsigned> sdw_def_info;
   if (rc.is_subdword())
      sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);

   if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
      return false;
   if (!rc.is_subdword() && reg.byte())
      return false;

   if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0)
      return false;

   PhysRegInterval reg_win = {reg, rc.size()};
   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
   PhysRegInterval vcc_win = {vcc, 2};
   /* VCC is outside the bounds */
   bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
   bool is_m0 = rc == s1 && reg == m0;
   if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
      return false;

   if (rc.is_subdword()) {
      PhysReg test_reg;
      test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
      if (reg_file.test(test_reg, sdw_def_info.second))
         return false;
   } else {
      if (reg_file.test(reg, rc.bytes()))
         return false;
   }

   adjust_max_used_regs(ctx, rc, reg_win.lo());
   return true;
}

bool
increase_register_file(ra_ctx& ctx, RegType type)
{
   if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
                                                          ctx.program->max_reg_demand.sgpr));
   } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
                                                          ctx.program->max_reg_demand.sgpr + 1));
   } else {
      return false;
   }
   return true;
}

struct IDAndRegClass {
   IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}

   unsigned id;
   RegClass rc;
};

struct IDAndInfo {
   IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}

   unsigned id;
   DefInfo info;
};

/* Reallocates vars by sorting them and placing each variable after the previous
 * one. If one of the variables has 0xffffffff as an ID, the register assigned
 * for that variable will be returned.
 */
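/* Illustrative example (comment added here, not from upstream): with v1
 * temporaries %a at v3 and %b at v6 and start = v0, both share the same
 * stride, so they are ordered by their current register and parallelcopied
 * to v0 and v1 respectively, leaving v2 and above free.
 */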
PhysReg
compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
{
   /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
    * temporary sizes to dwords.
    */
   std::vector<IDAndInfo> sorted;
   for (IDAndRegClass var : vars) {
      DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
      sorted.emplace_back(var.id, info);
   }

   std::sort(
      sorted.begin(), sorted.end(),
      [&ctx](const IDAndInfo& a, const IDAndInfo& b)
      {
         unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
         unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
         if (a_stride > b_stride)
            return true;
         if (a_stride < b_stride)
            return false;
         if (a.id == 0xffffffff || b.id == 0xffffffff)
            return a.id == 0xffffffff; /* place 0xffffffff first if possible; the order is otherwise arbitrary */
         return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
      });

   PhysReg next_reg = start;
   PhysReg space_reg;
   for (IDAndInfo& var : sorted) {
      unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
      next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));

      /* 0xffffffff is a special variable ID used to reserve space for killed
       * operands and definitions.
       */
      if (var.id != 0xffffffff) {
         if (next_reg != ctx.assignments[var.id].reg) {
            RegClass rc = ctx.assignments[var.id].rc;
            Temp tmp(var.id, rc);

            Operand pc_op(tmp);
            pc_op.setFixed(ctx.assignments[var.id].reg);
            Definition pc_def(next_reg, rc);
            parallelcopies.emplace_back(pc_op, pc_def);
         }
      } else {
         space_reg = next_reg;
      }

      adjust_max_used_regs(ctx, var.info.rc, next_reg);

      next_reg = next_reg.advance(var.info.rc.size() * 4);
   }

   return space_reg;
}

bool
is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
{
   PhysReg first{512};
   for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
      Operand op = instr->operands[i + 3];

      if (ctx.assignments[op.tempId()].assigned) {
         PhysReg reg = ctx.assignments[op.tempId()].reg;

         if (first.reg() == 512) {
            PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
            first = reg.advance(i * -4);
            PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
            if (!bounds.contains(vec)) /* not enough space for other operands */
               return false;
         } else {
            if (reg != first.advance(i * 4)) /* not at the best position */
               return false;
         }
      } else {
         /* If there's an unexpected temporary, this operand is unlikely to be
          * placed in the best position.
          */
         if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
            return false;
      }
   }

   return true;
}

std::optional<PhysReg>
get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
{
   Instruction* vec = ctx.vectors[temp.id()];
   unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
   unsigned our_offset = 0;
   for (unsigned i = first_operand; i < vec->operands.size(); i++) {
      Operand& op = vec->operands[i];
      if (op.isTemp() && op.tempId() == temp.id())
         break;
      else
         our_offset += op.bytes();
   }

   if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
      unsigned their_offset = 0;
      /* check for every operand of the vector
       * - whether the operand is assigned and
       * - we can use the register relative to that operand
       */
      for (unsigned i = first_operand; i < vec->operands.size(); i++) {
         Operand& op = vec->operands[i];
         if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
             ctx.assignments[op.tempId()].assigned) {
            PhysReg reg = ctx.assignments[op.tempId()].reg;
            reg.reg_b += (our_offset - their_offset);
            if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
               return reg;

            /* return if MIMG vaddr components don't remain vector-aligned */
            if (vec->format == Format::MIMG)
               return {};
         }
         their_offset += op.bytes();
      }

      /* We didn't find a register relative to other vector operands.
       * Try to find new space which fits the whole vector.
       */
      RegClass vec_rc = RegClass::get(temp.type(), their_offset);
      DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
      std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info);
      if (reg) {
         reg->reg_b += our_offset;
         /* make sure to only use byte offset if the instruction supports it */
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg))
            return reg;
      }
   }
   return {};
}

PhysReg
get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
        std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
        int operand_index = -1)
{
   auto split_vec = ctx.split_vectors.find(temp.id());
   if (split_vec != ctx.split_vectors.end()) {
      unsigned offset = 0;
      for (Definition def : split_vec->second->definitions) {
         if (ctx.assignments[def.tempId()].affinity) {
            assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
            if (affinity.assigned) {
               PhysReg reg = affinity.reg;
               reg.reg_b -= offset;
               if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
                  return reg;
            }
         }
         offset += def.bytes();
      }
   }

   if (ctx.assignments[temp.id()].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
      if (affinity.assigned) {
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
            return affinity.reg;
      }
   }
   if (ctx.assignments[temp.id()].vcc) {
      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc))
         return vcc;
   }
   if (ctx.assignments[temp.id()].m0) {
      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0) && can_write_m0(instr))
         return m0;
   }

   std::optional<PhysReg> res;

   if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
      res = get_reg_vector(ctx, reg_file, temp, instr);
      if (res)
         return *res;
   }

   DefInfo info(ctx, instr, temp.regClass(), operand_index);

   if (!ctx.policy.skip_optimistic_path) {
      /* try to find space without live-range splits */
      res = get_reg_simple(ctx, reg_file, info);

      if (res)
         return *res;
   }

   /* try to find space with live-range splits */
   res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);

   if (res)
      return *res;

   /* try using more registers */

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
   assert(reg_file.count_zero(info.bounds) >= info.size);

   if (!increase_register_file(ctx, info.rc.type())) {
      /* fallback algorithm: reallocate all variables at once */
      unsigned def_size = info.rc.size();
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_size += def.regClass().size();
      }

      unsigned killed_op_size = 0;
      for (Operand op : instr->operands) {
         if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_size += op.regClass().size();
      }

      const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());

      /* reallocate passthrough variables and non-killed operands */
      std::vector<IDAndRegClass> vars;
1710 for (unsigned id : find_vars(ctx, reg_file, regs))
1711 vars.emplace_back(id, ctx.assignments[id].rc);
1712 vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));
1713
1714 PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());
1715
1716 /* reallocate killed operands */
1717 std::vector<IDAndRegClass> killed_op_vars;
1718 for (Operand op : instr->operands) {
1719 if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
1720 killed_op_vars.emplace_back(op.tempId(), op.regClass());
1721 }
1722 compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);
1723
1724 /* reallocate definitions */
1725 std::vector<IDAndRegClass> def_vars;
1726 for (Definition def : instr->definitions) {
1727 if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1728 def_vars.emplace_back(def.tempId(), def.regClass());
1729 }
1730 def_vars.emplace_back(0xffffffff, info.rc);
1731 return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
1732 }
1733
1734 return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
1735 }
1736
1737 PhysReg
get_reg_create_vector(ra_ctx & ctx,RegisterFile & reg_file,Temp temp,std::vector<std::pair<Operand,Definition>> & parallelcopies,aco_ptr<Instruction> & instr)1738 get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
1739 std::vector<std::pair<Operand, Definition>>& parallelcopies,
1740 aco_ptr<Instruction>& instr)
1741 {
1742 RegClass rc = temp.regClass();
1743 /* create_vector instructions have different costs w.r.t. register coalescing */
1744 uint32_t size = rc.size();
1745 uint32_t bytes = rc.bytes();
1746 uint32_t stride = get_stride(rc);
1747 PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
1748
1749 // TODO: improve p_create_vector for sub-dword vectors
1750
1751 PhysReg best_pos{0xFFF};
1752 unsigned num_moves = 0xFF;
1753 bool best_avoid = true;
1754 uint32_t correct_pos_mask = 0;
1755
1756 /* test for each operand which definition placement causes the least shuffle instructions */
1757 for (unsigned i = 0, offset = 0; i < instr->operands.size();
1758 offset += instr->operands[i].bytes(), i++) {
1759 // TODO: think about, if we can alias live operands on the same register
1760 if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
1761 instr->operands[i].getTemp().type() != rc.type())
1762 continue;
1763
1764 if (offset > instr->operands[i].physReg().reg_b)
1765 continue;
1766
1767 unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
1768 if (reg_lower % 4)
1769 continue;
1770 PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
1771 unsigned k = 0;
1772
1773 /* no need to check multiple times */
1774 if (reg_win.lo() == best_pos)
1775 continue;
1776
1777 /* check borders */
1778 // TODO: this can be improved */
1779 if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
1780 continue;
1781 if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
1782 reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
1783 continue;
1784 if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
1785 reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
1786 continue;
1787
1788 /* count variables to be moved and check "avoid" */
1789 bool avoid = false;
1790 bool linear_vgpr = false;
1791 for (PhysReg j : reg_win) {
1792 if (reg_file[j] != 0) {
1793 if (reg_file[j] == 0xF0000000) {
1794 PhysReg reg;
1795 reg.reg_b = j * 4;
1796 unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
1797 for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
1798 k += reg_file.test(reg, 1);
1799 } else {
1800 k += 4;
1801 linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
1802 }
1803 }
1804 avoid |= ctx.war_hint[j];
1805 }
1806
1807 if (linear_vgpr) {
1808 /* we cannot split live ranges of linear vgprs inside control flow */
1809 if (ctx.block->kind & block_kind_top_level)
1810 avoid = true;
1811 else
1812 continue;
1813 }
1814
1815 if (avoid && !best_avoid)
1816 continue;
1817
1818 /* count operands in wrong positions */
1819 uint32_t correct_pos_mask_new = 0;
1820 for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
1821 offset2 += instr->operands[j].bytes(), j++) {
1822 Operand& op = instr->operands[j];
1823 if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
1824 correct_pos_mask_new |= 1 << j;
1825 else
1826 k += op.bytes();
1827 }
1828 bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
1829 if (k > num_moves || (!aligned && k == num_moves))
1830 continue;
1831
1832 best_pos = reg_win.lo();
1833 num_moves = k;
1834 best_avoid = avoid;
1835 correct_pos_mask = correct_pos_mask_new;
1836 }
1837
1838 /* too many moves: try the generic get_reg() function */
1839 if (num_moves >= 2 * bytes) {
1840 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1841 } else if (num_moves > bytes) {
1842 DefInfo info(ctx, instr, rc, -1);
1843 std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
1844 if (res)
1845 return *res;
1846 }
1847
1848 /* re-enable killed operands which are in the wrong position */
1849 RegisterFile tmp_file(reg_file);
1850 for (Operand& op : instr->operands) {
1851 if (op.isTemp() && op.isFirstKillBeforeDef())
1852 tmp_file.fill(op);
1853 }
1854 for (unsigned i = 0; i < instr->operands.size(); i++) {
1855 if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
1856 tmp_file.clear(instr->operands[i]);
1857 }
1858
1859 /* collect variables to be moved */
1860 std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
1861
1862 bool success = false;
1863 std::vector<std::pair<Operand, Definition>> pc;
1864 success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});
1865
1866 if (!success) {
1867 if (!increase_register_file(ctx, temp.type())) {
1868 /* use the fallback algorithm in get_reg() */
1869 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1870 }
1871 return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
1872 }
1873
1874 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1875 adjust_max_used_regs(ctx, rc, best_pos);
1876
1877 return best_pos;
1878 }
1879
1880 void
handle_pseudo(ra_ctx & ctx,const RegisterFile & reg_file,Instruction * instr)1881 handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
1882 {
1883 if (instr->format != Format::PSEUDO)
1884 return;
1885
1886 /* all instructions which use handle_operands() need this information */
1887 switch (instr->opcode) {
1888 case aco_opcode::p_extract_vector:
1889 case aco_opcode::p_create_vector:
1890 case aco_opcode::p_split_vector:
1891 case aco_opcode::p_parallelcopy:
1892 case aco_opcode::p_start_linear_vgpr: break;
1893 default: return;
1894 }
1895
1896 bool writes_linear = false;
1897 /* if all definitions are logical vgpr, no need to care for SCC */
1898 for (Definition& def : instr->definitions) {
1899 if (def.getTemp().regClass().is_linear())
1900 writes_linear = true;
1901 }
1902 /* if all operands are constant, no need to care either */
1903 bool reads_linear = false;
1904 bool reads_subdword = false;
1905 for (Operand& op : instr->operands) {
1906 if (op.isTemp() && op.getTemp().regClass().is_linear())
1907 reads_linear = true;
1908 if (op.isTemp() && op.regClass().is_subdword())
1909 reads_subdword = true;
1910 }
1911 bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
1912 (ctx.program->gfx_level <= GFX7 && reads_subdword);
1913 if (!needs_scratch_reg)
1914 return;
1915
1916 instr->pseudo().tmp_in_scc = reg_file[scc];
1917
1918 int reg = ctx.max_used_sgpr;
1919 for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
1920 ;
1921 if (reg < 0) {
1922 reg = ctx.max_used_sgpr + 1;
1923 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
1924 ;
1925 if (reg == ctx.program->max_reg_demand.sgpr) {
1926 assert(reads_subdword && reg_file[m0] == 0);
1927 reg = m0;
1928 }
1929 }
1930
1931 adjust_max_used_regs(ctx, s1, reg);
1932 instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
1933 }
1934
1935 bool
operand_can_use_reg(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr,unsigned idx,PhysReg reg,RegClass rc)1936 operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
1937 RegClass rc)
1938 {
1939 if (reg.byte()) {
1940 unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
1941 if (reg.byte() % stride)
1942 return false;
1943 }
1944
1945 switch (instr->format) {
1946 case Format::SMEM:
1947 return reg != scc && reg != exec &&
1948 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1949 (reg != vcc || (instr->definitions.empty() && idx == 2) ||
1950 gfx_level >= GFX10); /* sdata can be vcc */
1951 default:
1952 // TODO: there are more instructions with restrictions on registers
1953 return true;
1954 }
1955 }
1956
1957 void
handle_fixed_operands(ra_ctx & ctx,RegisterFile & register_file,std::vector<std::pair<Operand,Definition>> & parallelcopy,aco_ptr<Instruction> & instr)1958 handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
1959 std::vector<std::pair<Operand, Definition>>& parallelcopy,
1960 aco_ptr<Instruction>& instr)
1961 {
1962 assert(instr->operands.size() <= 128);
1963
1964 RegisterFile tmp_file(register_file);
1965
1966 BITSET_DECLARE(mask, 128) = {0};
1967
1968 for (unsigned i = 0; i < instr->operands.size(); i++) {
1969 Operand& op = instr->operands[i];
1970
1971 if (!op.isTemp() || !op.isFixed())
1972 continue;
1973
1974 PhysReg src = ctx.assignments[op.tempId()].reg;
1975 adjust_max_used_regs(ctx, op.regClass(), op.physReg());
1976
1977 if (op.physReg() == src) {
1978 tmp_file.block(op.physReg(), op.regClass());
1979 continue;
1980 }
1981
1982 unsigned j;
1983 bool found = false;
1984 BITSET_FOREACH_SET (j, mask, i) {
1985 if (instr->operands[j].tempId() == op.tempId() &&
1986 instr->operands[j].physReg() == op.physReg()) {
1987 found = true;
1988 break;
1989 }
1990 }
1991 if (found)
1992 continue; /* the copy is already added to the list */
1993
1994 /* clear from register_file so fixed operands are not collected be collect_vars() */
1995 tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
1996
1997 BITSET_SET(mask, i);
1998
1999 Operand pc_op(instr->operands[i].getTemp(), src);
2000 Definition pc_def = Definition(op.physReg(), pc_op.regClass());
2001 parallelcopy.emplace_back(pc_op, pc_def);
2002 }
2003
2004 if (BITSET_IS_EMPTY(mask))
2005 return;
2006
2007 unsigned i;
2008 std::vector<unsigned> blocking_vars;
2009 BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
2010 Operand& op = instr->operands[i];
2011 PhysRegInterval target{op.physReg(), op.size()};
2012 std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
2013 blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
2014
2015 /* prevent get_regs_for_copies() from using these registers */
2016 tmp_file.block(op.physReg(), op.regClass());
2017 }
2018
2019 get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
2020 update_renames(ctx, register_file, parallelcopy, instr,
2021 rename_not_killed_ops | fill_killed_ops | rename_precolored_ops);
2022 }
2023
2024 void
get_reg_for_operand(ra_ctx & ctx,RegisterFile & register_file,std::vector<std::pair<Operand,Definition>> & parallelcopy,aco_ptr<Instruction> & instr,Operand & operand,unsigned operand_index)2025 get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
2026 std::vector<std::pair<Operand, Definition>>& parallelcopy,
2027 aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
2028 {
2029 /* clear the operand in case it's only a stride mismatch */
2030 PhysReg src = ctx.assignments[operand.tempId()].reg;
2031 register_file.clear(src, operand.regClass());
2032 PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
2033
2034 Operand pc_op = operand;
2035 pc_op.setFixed(src);
2036 Definition pc_def = Definition(dst, pc_op.regClass());
2037 parallelcopy.emplace_back(pc_op, pc_def);
2038 update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
2039 }
2040
2041 PhysReg
get_reg_phi(ra_ctx & ctx,IDSet & live_in,RegisterFile & register_file,std::vector<aco_ptr<Instruction>> & instructions,Block & block,aco_ptr<Instruction> & phi,Temp tmp)2042 get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
2043 std::vector<aco_ptr<Instruction>>& instructions, Block& block,
2044 aco_ptr<Instruction>& phi, Temp tmp)
2045 {
2046 std::vector<std::pair<Operand, Definition>> parallelcopy;
2047 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
2048 update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
2049
2050 /* process parallelcopy */
2051 for (std::pair<Operand, Definition> pc : parallelcopy) {
2052 /* see if it's a copy from a different phi */
2053 // TODO: prefer moving some previous phis over live-ins
2054 // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
2055 // problem in practice since they can only be fixed to exec)
2056 Instruction* prev_phi = NULL;
2057 std::vector<aco_ptr<Instruction>>::iterator phi_it;
2058 for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
2059 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
2060 prev_phi = phi_it->get();
2061 }
2062 if (prev_phi) {
2063 /* if so, just update that phi's register */
2064 prev_phi->definitions[0].setFixed(pc.second.physReg());
2065 register_file.fill(prev_phi->definitions[0]);
2066 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
2067 pc.second.regClass()};
2068 continue;
2069 }
2070
2071 /* rename */
2072 std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
2073 Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
2074 ctx.orig_names[pc.second.tempId()] = orig;
2075 ctx.renames[block.index][orig.id()] = pc.second.getTemp();
2076
2077 /* otherwise, this is a live-in and we need to create a new phi
2078 * to move it in this block's predecessors */
2079 aco_opcode opcode =
2080 pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2081 std::vector<unsigned>& preds =
2082 pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
2083 aco_ptr<Instruction> new_phi{
2084 create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
2085 new_phi->definitions[0] = pc.second;
2086 for (unsigned i = 0; i < preds.size(); i++)
2087 new_phi->operands[i] = Operand(pc.first);
2088 instructions.emplace_back(std::move(new_phi));
2089
2090 /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
2091 * a loop header.
2092 */
2093 live_in.erase(orig.id());
2094 }
2095
2096 return reg;
2097 }
2098
2099 void
get_regs_for_phis(ra_ctx & ctx,Block & block,RegisterFile & register_file,std::vector<aco_ptr<Instruction>> & instructions,IDSet & live_in)2100 get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
2101 std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
2102 {
2103 /* move all phis to instructions */
2104 for (aco_ptr<Instruction>& phi : block.instructions) {
2105 if (!is_phi(phi))
2106 break;
2107 if (!phi->definitions[0].isKill())
2108 instructions.emplace_back(std::move(phi));
2109 }
2110
2111 /* assign phis with all-matching registers to that register */
2112 for (aco_ptr<Instruction>& phi : instructions) {
2113 Definition& definition = phi->definitions[0];
2114 if (definition.isFixed())
2115 continue;
2116
2117 if (!phi->operands[0].isTemp())
2118 continue;
2119
2120 PhysReg reg = phi->operands[0].physReg();
2121 auto OpsSame = [=](const Operand& op) -> bool
2122 { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
2123 bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
2124 if (!all_same)
2125 continue;
2126
2127 if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg))
2128 continue;
2129
2130 definition.setFixed(reg);
2131 register_file.fill(definition);
2132 ctx.assignments[definition.tempId()].set(definition);
2133 }
2134
2135 /* try to find a register that is used by at least one operand */
2136 for (aco_ptr<Instruction>& phi : instructions) {
2137 Definition& definition = phi->definitions[0];
2138 if (definition.isFixed())
2139 continue;
2140
2141 /* use affinity if available */
2142 if (ctx.assignments[definition.tempId()].affinity &&
2143 ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
2144 assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
2145 assert(affinity.rc == definition.regClass());
2146 if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) {
2147 definition.setFixed(affinity.reg);
2148 register_file.fill(definition);
2149 ctx.assignments[definition.tempId()].set(definition);
2150 continue;
2151 }
2152 }
2153
2154 /* by going backwards, we aim to avoid copies in else-blocks */
2155 for (int i = phi->operands.size() - 1; i >= 0; i--) {
2156 const Operand& op = phi->operands[i];
2157 if (!op.isTemp() || !op.isFixed())
2158 continue;
2159
2160 PhysReg reg = op.physReg();
2161 if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
2162 definition.setFixed(reg);
2163 register_file.fill(definition);
2164 ctx.assignments[definition.tempId()].set(definition);
2165 break;
2166 }
2167 }
2168 }
2169
2170 /* find registers for phis where the register was blocked or no operand was assigned */
2171
2172 /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
2173 for (unsigned i = 0; i < instructions.size(); i++) {
2174 aco_ptr<Instruction>& phi = instructions[i];
2175 Definition& definition = phi->definitions[0];
2176 if (definition.isFixed())
2177 continue;
2178
2179 definition.setFixed(
2180 get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));
2181
2182 register_file.fill(definition);
2183 ctx.assignments[definition.tempId()].set(definition);
2184 }
2185 }
2186
2187 Temp
read_variable(ra_ctx & ctx,Temp val,unsigned block_idx)2188 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
2189 {
2190 std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
2191 if (it == ctx.renames[block_idx].end())
2192 return val;
2193 else
2194 return it->second;
2195 }
2196
2197 Temp
handle_live_in(ra_ctx & ctx,Temp val,Block * block)2198 handle_live_in(ra_ctx& ctx, Temp val, Block* block)
2199 {
2200 std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
2201 if (preds.size() == 0)
2202 return val;
2203
2204 if (preds.size() == 1) {
2205 /* if the block has only one predecessor, just look there for the name */
2206 return read_variable(ctx, val, preds[0]);
2207 }
2208
2209 /* there are multiple predecessors and the block is sealed */
2210 Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));
2211
2212 /* get the rename from each predecessor and check if they are the same */
2213 Temp new_val;
2214 bool needs_phi = false;
2215 for (unsigned i = 0; i < preds.size(); i++) {
2216 ops[i] = read_variable(ctx, val, preds[i]);
2217 if (i == 0)
2218 new_val = ops[i];
2219 else
2220 needs_phi |= !(new_val == ops[i]);
2221 }
2222
2223 if (needs_phi) {
2224 assert(!val.regClass().is_linear_vgpr());
2225
2226 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
2227 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2228 aco_ptr<Instruction> phi{
2229 create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
2230 new_val = ctx.program->allocateTmp(val.regClass());
2231 phi->definitions[0] = Definition(new_val);
2232 ctx.assignments.emplace_back();
2233 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
2234 for (unsigned i = 0; i < preds.size(); i++) {
2235 /* update the operands so that it uses the new affinity */
2236 phi->operands[i] = Operand(ops[i]);
2237 assert(ctx.assignments[ops[i].id()].assigned);
2238 assert(ops[i].regClass() == new_val.regClass());
2239 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
2240 }
2241 block->instructions.insert(block->instructions.begin(), std::move(phi));
2242 }
2243
2244 return new_val;
2245 }
2246
2247 void
handle_loop_phis(ra_ctx & ctx,const IDSet & live_in,uint32_t loop_header_idx,uint32_t loop_exit_idx)2248 handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
2249 uint32_t loop_exit_idx)
2250 {
2251 Block& loop_header = ctx.program->blocks[loop_header_idx];
2252 std::unordered_map<unsigned, Temp> renames;
2253
2254 /* create phis for variables renamed during the loop */
2255 for (unsigned t : live_in) {
2256 Temp val = Temp(t, ctx.program->temp_rc[t]);
2257 Temp prev = read_variable(ctx, val, loop_header_idx - 1);
2258 Temp renamed = handle_live_in(ctx, val, &loop_header);
2259 if (renamed == prev)
2260 continue;
2261
2262 /* insert additional renames at block end, but don't overwrite */
2263 renames[prev.id()] = renamed;
2264 ctx.orig_names[renamed.id()] = val;
2265 for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2266 auto it = ctx.renames[idx].emplace(val.id(), renamed);
2267 /* if insertion is unsuccessful, update if necessary */
2268 if (!it.second && it.first->second == prev)
2269 it.first->second = renamed;
2270 }
2271
2272 /* update loop-carried values of the phi created by handle_live_in() */
2273 for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
2274 Operand& op = loop_header.instructions[0]->operands[i];
2275 if (op.getTemp() == prev)
2276 op.setTemp(renamed);
2277 }
2278
2279 /* use the assignment from the loop preheader and fix def reg */
2280 assignment& var = ctx.assignments[prev.id()];
2281 ctx.assignments[renamed.id()] = var;
2282 loop_header.instructions[0]->definitions[0].setFixed(var.reg);
2283 }
2284
2285 /* rename loop carried phi operands */
2286 for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
2287 aco_ptr<Instruction>& phi = loop_header.instructions[i];
2288 if (!is_phi(phi))
2289 break;
2290 const std::vector<unsigned>& preds =
2291 phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
2292 for (unsigned j = 1; j < phi->operands.size(); j++) {
2293 Operand& op = phi->operands[j];
2294 if (!op.isTemp())
2295 continue;
2296
2297 /* Find the original name, since this operand might not use the original name if the phi
2298 * was created after init_reg_file().
2299 */
2300 std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId());
2301 Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();
2302
2303 op.setTemp(read_variable(ctx, orig, preds[j]));
2304 op.setFixed(ctx.assignments[op.tempId()].reg);
2305 }
2306 }
2307
2308 /* return early if no new phi was created */
2309 if (renames.empty())
2310 return;
2311
2312 /* propagate new renames through loop */
2313 for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2314 Block& current = ctx.program->blocks[idx];
2315 /* rename all uses in this block */
2316 for (aco_ptr<Instruction>& instr : current.instructions) {
2317 /* phis are renamed after RA */
2318 if (idx == loop_header_idx && is_phi(instr))
2319 continue;
2320
2321 for (Operand& op : instr->operands) {
2322 if (!op.isTemp())
2323 continue;
2324
2325 auto rename = renames.find(op.tempId());
2326 if (rename != renames.end()) {
2327 assert(rename->second.id());
2328 op.setTemp(rename->second);
2329 }
2330 }
2331 }
2332 }
2333 }
2334
2335 /**
2336 * This function serves the purpose to correctly initialize the register file
2337 * at the beginning of a block (before any existing phis).
2338 * In order to do so, all live-in variables are entered into the RegisterFile.
2339 * Reg-to-reg moves (renames) from previous blocks are taken into account and
2340 * the SSA is repaired by inserting corresponding phi-nodes.
2341 */
2342 RegisterFile
init_reg_file(ra_ctx & ctx,const std::vector<IDSet> & live_out_per_block,Block & block)2343 init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
2344 {
2345 if (block.kind & block_kind_loop_exit) {
2346 uint32_t header = ctx.loop_header.back();
2347 ctx.loop_header.pop_back();
2348 handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
2349 }
2350
2351 RegisterFile register_file;
2352 const IDSet& live_in = live_out_per_block[block.index];
2353 assert(block.index != 0 || live_in.empty());
2354
2355 if (block.kind & block_kind_loop_header) {
2356 ctx.loop_header.emplace_back(block.index);
2357 /* already rename phis incoming value */
2358 for (aco_ptr<Instruction>& instr : block.instructions) {
2359 if (!is_phi(instr))
2360 break;
2361 Operand& operand = instr->operands[0];
2362 if (operand.isTemp()) {
2363 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
2364 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2365 }
2366 }
2367 for (unsigned t : live_in) {
2368 Temp val = Temp(t, ctx.program->temp_rc[t]);
2369 Temp renamed = read_variable(ctx, val, block.index - 1);
2370 if (renamed != val)
2371 ctx.renames[block.index][val.id()] = renamed;
2372 assignment& var = ctx.assignments[renamed.id()];
2373 assert(var.assigned);
2374 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
2375 }
2376 } else {
2377 /* rename phi operands */
2378 for (aco_ptr<Instruction>& instr : block.instructions) {
2379 if (!is_phi(instr))
2380 break;
2381 const std::vector<unsigned>& preds =
2382 instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
2383
2384 for (unsigned i = 0; i < instr->operands.size(); i++) {
2385 Operand& operand = instr->operands[i];
2386 if (!operand.isTemp())
2387 continue;
2388 operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2389 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2390 }
2391 }
2392 for (unsigned t : live_in) {
2393 Temp val = Temp(t, ctx.program->temp_rc[t]);
2394 Temp renamed = handle_live_in(ctx, val, &block);
2395 assignment& var = ctx.assignments[renamed.id()];
2396 /* due to live-range splits, the live-in might be a phi, now */
2397 if (var.assigned) {
2398 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
2399 }
2400 if (renamed != val) {
2401 ctx.renames[block.index].emplace(t, renamed);
2402 ctx.orig_names[renamed.id()] = val;
2403 }
2404 }
2405 }
2406
2407 return register_file;
2408 }
2409
2410 void
get_affinities(ra_ctx & ctx,std::vector<IDSet> & live_out_per_block)2411 get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
2412 {
2413 std::vector<std::vector<Temp>> phi_resources;
2414 std::unordered_map<unsigned, unsigned> temp_to_phi_resources;
2415
2416 for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
2417 block_rit++) {
2418 Block& block = *block_rit;
2419
2420 /* first, compute the death points of all live vars within the block */
2421 IDSet& live = live_out_per_block[block.index];
2422
2423 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
2424 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
2425 aco_ptr<Instruction>& instr = *rit;
2426 if (is_phi(instr))
2427 break;
2428
2429 /* add vector affinities */
2430 if (instr->opcode == aco_opcode::p_create_vector) {
2431 for (const Operand& op : instr->operands) {
2432 if (op.isTemp() && op.isFirstKill() &&
2433 op.getTemp().type() == instr->definitions[0].getTemp().type())
2434 ctx.vectors[op.tempId()] = instr.get();
2435 }
2436 } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
2437 !instr->mimg().strict_wqm) {
2438 for (unsigned i = 3; i < instr->operands.size(); i++)
2439 ctx.vectors[instr->operands[i].tempId()] = instr.get();
2440 } else if (instr->opcode == aco_opcode::p_split_vector &&
2441 instr->operands[0].isFirstKillBeforeDef()) {
2442 ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
2443 } else if (instr->isVOPC() && !instr->isVOP3()) {
2444 if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
2445 ctx.assignments[instr->definitions[0].tempId()].vcc = true;
2446 } else if (instr->isVOP2() && !instr->isVOP3()) {
2447 if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
2448 instr->operands[2].regClass().type() == RegType::sgpr)
2449 ctx.assignments[instr->operands[2].tempId()].vcc = true;
2450 if (instr->definitions.size() == 2)
2451 ctx.assignments[instr->definitions[1].tempId()].vcc = true;
2452 } else if (instr->opcode == aco_opcode::s_and_b32 ||
2453 instr->opcode == aco_opcode::s_and_b64) {
2454 /* If SCC is used by a branch, we might be able to use
2455 * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
2456 */
2457 if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
2458 instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
2459 ctx.assignments[instr->operands[0].tempId()].vcc = true;
2460 } else if (instr->opcode == aco_opcode::s_sendmsg) {
2461 ctx.assignments[instr->operands[0].tempId()].m0 = true;
2462 }
2463
2464 /* add operands to live variables */
2465 for (const Operand& op : instr->operands) {
2466 if (op.isTemp())
2467 live.insert(op.tempId());
2468 }
2469
2470 /* erase definitions from live */
2471 for (unsigned i = 0; i < instr->definitions.size(); i++) {
2472 const Definition& def = instr->definitions[i];
2473 if (!def.isTemp())
2474 continue;
2475 live.erase(def.tempId());
2476 /* mark last-seen phi operand */
2477 std::unordered_map<unsigned, unsigned>::iterator it =
2478 temp_to_phi_resources.find(def.tempId());
2479 if (it != temp_to_phi_resources.end() &&
2480 def.regClass() == phi_resources[it->second][0].regClass()) {
2481 phi_resources[it->second][0] = def.getTemp();
2482 /* try to coalesce phi affinities with parallelcopies */
2483 Operand op = Operand();
2484 switch (instr->opcode) {
2485 case aco_opcode::p_parallelcopy: op = instr->operands[i]; break;
2486
2487 case aco_opcode::v_interp_p2_f32:
2488 case aco_opcode::v_writelane_b32:
2489 case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break;
2490
2491 case aco_opcode::v_fma_f32:
2492 case aco_opcode::v_fma_f16:
2493 case aco_opcode::v_pk_fma_f16:
2494 if (ctx.program->gfx_level < GFX10)
2495 continue;
2496 FALLTHROUGH;
2497 case aco_opcode::v_mad_f32:
2498 case aco_opcode::v_mad_f16:
2499 if (instr->usesModifiers())
2500 continue;
2501 op = instr->operands[2];
2502 break;
2503
2504 case aco_opcode::v_mad_legacy_f32:
2505 case aco_opcode::v_fma_legacy_f32:
2506 if (instr->usesModifiers() || !ctx.program->dev.has_mac_legacy32)
2507 continue;
2508 op = instr->operands[2];
2509 break;
2510
2511 default: continue;
2512 }
2513
2514 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
2515 phi_resources[it->second].emplace_back(op.getTemp());
2516 temp_to_phi_resources[op.tempId()] = it->second;
2517 }
2518 }
2519 }
2520 }
2521
2522 /* collect phi affinities */
2523 for (; rit != block.instructions.rend(); ++rit) {
2524 aco_ptr<Instruction>& instr = *rit;
2525 assert(is_phi(instr));
2526
2527 live.erase(instr->definitions[0].tempId());
2528 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
2529 continue;
2530
2531 assert(instr->definitions[0].isTemp());
2532 std::unordered_map<unsigned, unsigned>::iterator it =
2533 temp_to_phi_resources.find(instr->definitions[0].tempId());
2534 unsigned index = phi_resources.size();
2535 std::vector<Temp>* affinity_related;
2536 if (it != temp_to_phi_resources.end()) {
2537 index = it->second;
2538 phi_resources[index][0] = instr->definitions[0].getTemp();
2539 affinity_related = &phi_resources[index];
2540 } else {
2541 phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
2542 affinity_related = &phi_resources.back();
2543 }
2544
2545 for (const Operand& op : instr->operands) {
2546 if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
2547 affinity_related->emplace_back(op.getTemp());
2548 if (block.kind & block_kind_loop_header)
2549 continue;
2550 temp_to_phi_resources[op.tempId()] = index;
2551 }
2552 }
2553 }
2554
2555 /* visit the loop header phis first in order to create nested affinities */
2556 if (block.kind & block_kind_loop_exit) {
2557 /* find loop header */
2558 auto header_rit = block_rit;
2559 while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
2560 header_rit++;
2561
2562 for (aco_ptr<Instruction>& phi : header_rit->instructions) {
2563 if (!is_phi(phi))
2564 break;
2565 if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
2566 continue;
2567
2568 /* create an (empty) merge-set for the phi-related variables */
2569 auto it = temp_to_phi_resources.find(phi->definitions[0].tempId());
2570 unsigned index = phi_resources.size();
2571 if (it == temp_to_phi_resources.end()) {
2572 temp_to_phi_resources[phi->definitions[0].tempId()] = index;
2573 phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
2574 } else {
2575 index = it->second;
2576 }
2577 for (unsigned i = 1; i < phi->operands.size(); i++) {
2578 const Operand& op = phi->operands[i];
2579 if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
2580 temp_to_phi_resources[op.tempId()] = index;
2581 }
2582 }
2583 }
2584 }
2585 }
2586 /* create affinities */
2587 for (std::vector<Temp>& vec : phi_resources) {
2588 for (unsigned i = 1; i < vec.size(); i++)
2589 if (vec[i].id() != vec[0].id())
2590 ctx.assignments[vec[i].id()].affinity = vec[0].id();
2591 }
2592 }
2593
2594 void
optimize_encoding_vop2(Program * program,ra_ctx & ctx,RegisterFile & register_file,aco_ptr<Instruction> & instr)2595 optimize_encoding_vop2(Program* program, ra_ctx& ctx, RegisterFile& register_file,
2596 aco_ptr<Instruction>& instr)
2597 {
2598 /* try to optimize v_mad_f32 -> v_mac_f32 */
2599 if ((instr->opcode != aco_opcode::v_mad_f32 &&
2600 (instr->opcode != aco_opcode::v_fma_f32 || program->gfx_level < GFX10) &&
2601 instr->opcode != aco_opcode::v_mad_f16 && instr->opcode != aco_opcode::v_mad_legacy_f16 &&
2602 (instr->opcode != aco_opcode::v_fma_f16 || program->gfx_level < GFX10) &&
2603 (instr->opcode != aco_opcode::v_pk_fma_f16 || program->gfx_level < GFX10) &&
2604 (instr->opcode != aco_opcode::v_mad_legacy_f32 || !program->dev.has_mac_legacy32) &&
2605 (instr->opcode != aco_opcode::v_fma_legacy_f32 || !program->dev.has_mac_legacy32) &&
2606 (instr->opcode != aco_opcode::v_dot4_i32_i8 || program->family == CHIP_VEGA20)) ||
2607 !instr->operands[2].isTemp() || !instr->operands[2].isKillBeforeDef() ||
2608 instr->operands[2].getTemp().type() != RegType::vgpr ||
2609 (!instr->operands[0].isOfType(RegType::vgpr) &&
2610 !instr->operands[1].isOfType(RegType::vgpr)) ||
2611 instr->operands[2].physReg().byte() != 0 || instr->valu().opsel[2])
2612 return;
2613
2614 if (instr->isVOP3P() && (instr->valu().opsel_lo != 0 || instr->valu().opsel_hi != 0x7))
2615 return;
2616
2617 if ((instr->operands[0].physReg().byte() != 0 || instr->operands[1].physReg().byte() != 0 ||
2618 instr->valu().opsel) &&
2619 program->gfx_level < GFX11)
2620 return;
2621
2622 unsigned im_mask = instr->isDPP16() ? 0x3 : 0;
2623 if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) ||
2624 (instr->valu().neg & ~im_mask))
2625 return;
2626
2627 if (!instr->operands[1].isOfType(RegType::vgpr))
2628 instr->valu().swapOperands(0, 1);
2629
2630 if (!instr->operands[0].isOfType(RegType::vgpr) && instr->valu().opsel[0])
2631 return;
2632
2633 unsigned def_id = instr->definitions[0].tempId();
2634 if (ctx.assignments[def_id].affinity) {
2635 assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2636 if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
2637 !register_file.test(affinity.reg, instr->operands[2].bytes()))
2638 return;
2639 }
2640
2641 instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) |
2642 (unsigned)Format::VOP2);
2643 instr->valu().opsel_hi = 0;
2644 switch (instr->opcode) {
2645 case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
2646 case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
2647 case aco_opcode::v_mad_f16:
2648 case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
2649 case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
2650 case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
2651 case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
2652 case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
2653 case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
2654 default: break;
2655 }
2656 }
2657
2658 void
optimize_encoding_sopk(Program * program,ra_ctx & ctx,RegisterFile & register_file,aco_ptr<Instruction> & instr)2659 optimize_encoding_sopk(Program* program, ra_ctx& ctx, RegisterFile& register_file,
2660 aco_ptr<Instruction>& instr)
2661 {
2662 /* try to optimize sop2 with literal source to sopk */
2663 if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_mul_i32 &&
2664 instr->opcode != aco_opcode::s_cselect_b32)
2665 return;
2666
2667 uint32_t literal_idx = 0;
2668
2669 if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
2670 literal_idx = 1;
2671
2672 if (!instr->operands[!literal_idx].isTemp() ||
2673 !instr->operands[!literal_idx].isKillBeforeDef() ||
2674 instr->operands[!literal_idx].getTemp().type() != RegType::sgpr ||
2675 instr->operands[!literal_idx].physReg() >= 128)
2676 return;
2677
2678 if (!instr->operands[literal_idx].isLiteral())
2679 return;
2680
2681 const uint32_t i16_mask = 0xffff8000u;
2682 uint32_t value = instr->operands[literal_idx].constantValue();
2683 if ((value & i16_mask) && (value & i16_mask) != i16_mask)
2684 return;
2685
2686 unsigned def_id = instr->definitions[0].tempId();
2687 if (ctx.assignments[def_id].affinity) {
2688 assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2689 if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
2690 !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
2691 return;
2692 }
2693
2694 static_assert(sizeof(SOPK_instruction) <= sizeof(SOP2_instruction),
2695 "Invalid direct instruction cast.");
2696 instr->format = Format::SOPK;
2697 SOPK_instruction* instr_sopk = &instr->sopk();
2698
2699 instr_sopk->imm = instr_sopk->operands[literal_idx].constantValue() & 0xffff;
2700 if (literal_idx == 0)
2701 std::swap(instr_sopk->operands[0], instr_sopk->operands[1]);
2702 if (instr_sopk->operands.size() > 2)
2703 std::swap(instr_sopk->operands[1], instr_sopk->operands[2]);
2704 instr_sopk->operands.pop_back();
2705
2706 switch (instr_sopk->opcode) {
2707 case aco_opcode::s_add_i32: instr_sopk->opcode = aco_opcode::s_addk_i32; break;
2708 case aco_opcode::s_mul_i32: instr_sopk->opcode = aco_opcode::s_mulk_i32; break;
2709 case aco_opcode::s_cselect_b32: instr_sopk->opcode = aco_opcode::s_cmovk_i32; break;
2710 default: unreachable("illegal instruction");
2711 }
2712 }
2713
2714 void
optimize_encoding(Program * program,ra_ctx & ctx,RegisterFile & register_file,aco_ptr<Instruction> & instr)2715 optimize_encoding(Program* program, ra_ctx& ctx, RegisterFile& register_file,
2716 aco_ptr<Instruction>& instr)
2717 {
2718 if (instr->isVALU())
2719 optimize_encoding_vop2(program, ctx, register_file, instr);
2720 if (instr->isSALU())
2721 optimize_encoding_sopk(program, ctx, register_file, instr);
2722 }
2723
2724 } /* end namespace */
2725
2726 void
register_allocation(Program * program,std::vector<IDSet> & live_out_per_block,ra_test_policy policy)2727 register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
2728 {
2729 ra_ctx ctx(program, policy);
2730 get_affinities(ctx, live_out_per_block);
2731
2732 for (Block& block : program->blocks) {
2733 ctx.block = █
2734
2735 /* initialize register file */
2736 RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
2737 ctx.war_hint.reset();
2738
2739 std::vector<aco_ptr<Instruction>> instructions;
2740 instructions.reserve(block.instructions.size());
2741
2742 /* this is a slight adjustment from the paper as we already have phi nodes:
2743 * We consider them incomplete phis and only handle the definition. */
2744 get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]);
2745
2746 /* If this is a merge block, the state of the register file at the branch instruction of the
2747 * predecessors corresponds to the state after phis at the merge block. So, we allocate a
2748 * register for the predecessor's branch definitions as if there was a phi.
2749 */
2750 if (!block.linear_preds.empty() &&
2751 (block.linear_preds.size() != 1 ||
2752 program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) {
2753 PhysReg br_reg = get_reg_phi(ctx, live_out_per_block[block.index], register_file,
2754 instructions, block, ctx.phi_dummy, Temp(0, s2));
2755 for (unsigned pred : block.linear_preds) {
2756 program->blocks[pred].scc_live_out = register_file[scc];
2757 aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
2758
2759 assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 &&
2760 br->definitions[0].isKill());
2761
2762 br->definitions[0].setFixed(br_reg);
2763 }
2764 }
2765
2766 /* Handle all other instructions of the block */
2767 auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
2768 std::vector<aco_ptr<Instruction>>::iterator instr_it =
2769 std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
2770 for (; instr_it != block.instructions.end(); ++instr_it) {
2771 aco_ptr<Instruction>& instr = *instr_it;
2772
2773 /* parallelcopies from p_phi are inserted here which means
2774 * live ranges of killed operands end here as well */
2775 if (instr->opcode == aco_opcode::p_logical_end) {
2776 /* no need to process this instruction any further */
2777 if (block.logical_succs.size() != 1) {
2778 instructions.emplace_back(std::move(instr));
2779 continue;
2780 }
2781
2782 Block& succ = program->blocks[block.logical_succs[0]];
2783 unsigned idx = 0;
2784 for (; idx < succ.logical_preds.size(); idx++) {
2785 if (succ.logical_preds[idx] == block.index)
2786 break;
2787 }
2788 for (aco_ptr<Instruction>& phi : succ.instructions) {
2789 if (phi->opcode == aco_opcode::p_phi) {
2790 if (phi->operands[idx].isTemp() &&
2791 phi->operands[idx].getTemp().type() == RegType::sgpr &&
2792 phi->operands[idx].isFirstKillBeforeDef()) {
2793 Definition phi_op(
2794 read_variable(ctx, phi->operands[idx].getTemp(), block.index));
2795 phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg);
2796 register_file.clear(phi_op);
2797 }
2798 } else if (phi->opcode != aco_opcode::p_linear_phi) {
2799 break;
2800 }
2801 }
2802 instructions.emplace_back(std::move(instr));
2803 continue;
2804 }
2805
2806 /* unconditional branches are handled after phis of the target */
2807 if (instr->opcode == aco_opcode::p_branch) {
2808 /* last instruction of the block */
2809 instructions.emplace_back(std::move(instr));
2810 break;
2811 }
2812
2813 std::vector<std::pair<Operand, Definition>> parallelcopy;
2814
2815 assert(!is_phi(instr));
2816
2817 bool temp_in_scc = register_file[scc];
2818
2819 /* handle operands */
2820 bool fixed = false;
2821 for (unsigned i = 0; i < instr->operands.size(); ++i) {
2822 auto& operand = instr->operands[i];
2823 if (!operand.isTemp())
2824 continue;
2825
2826 /* rename operands */
2827 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
2828 assert(ctx.assignments[operand.tempId()].assigned);
2829
2830 fixed |=
2831 operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg();
2832 }
2833
2834 bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
2835 instr->opcode == aco_opcode::v_writelane_b32_e64;
2836 if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
2837 instr->operands[1].isTemp()) {
2838 /* v_writelane_b32 can take two sgprs but only if one is m0. */
2839 if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
2840 ctx.assignments[instr->operands[1].tempId()].reg != m0) {
2841 instr->operands[0].setFixed(m0);
2842 fixed = true;
2843 }
2844 }
2845
2846 if (fixed)
2847 handle_fixed_operands(ctx, register_file, parallelcopy, instr);
2848
2849 for (unsigned i = 0; i < instr->operands.size(); ++i) {
2850 auto& operand = instr->operands[i];
2851 if (!operand.isTemp() || operand.isFixed())
2852 continue;
2853
2854 PhysReg reg = ctx.assignments[operand.tempId()].reg;
2855 if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
2856 operand.setFixed(reg);
2857 else
2858 get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
2859
2860 if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
2861 (instr->isDS() && instr->ds().gds)) {
2862 for (unsigned j = 0; j < operand.size(); j++)
2863 ctx.war_hint.set(operand.physReg().reg() + j);
2864 }
2865 }
2866
2867 /* remove dead vars from register file */
2868 for (const Operand& op : instr->operands) {
2869 if (op.isTemp() && op.isFirstKillBeforeDef())
2870 register_file.clear(op);
2871 }
2872
2873 optimize_encoding(program, ctx, register_file, instr);
2874
2875 /* Handle definitions which must have the same register as an operand.
2876 * We expect that the definition has the same size as the operand, otherwise the new
2877 * location for the operand (if it's not killed) might intersect with the old one.
2878 * We can't read from the old location because it's corrupted, and we can't write the new
2879 * location because that's used by a live-through operand.
2880 */
2881 int op_fixed_to_def = get_op_fixed_to_def(instr.get());
2882 if (op_fixed_to_def != -1)
2883 instr->definitions[0].setFixed(instr->operands[op_fixed_to_def].physReg());
2884
2885 /* handle fixed definitions first */
2886 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2887 auto& definition = instr->definitions[i];
2888 if (!definition.isFixed())
2889 continue;
2890
2891 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
2892 /* check if the target register is blocked */
2893 if (register_file.test(definition.physReg(), definition.bytes())) {
2894 const PhysRegInterval def_regs{definition.physReg(), definition.size()};
2895
2896 /* create parallelcopy pair to move blocking vars */
2897 std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);
2898
2899 RegisterFile tmp_file(register_file);
2900 /* re-enable the killed operands, so that we don't move the blocking vars there */
2901 for (const Operand& op : instr->operands) {
2902 if (op.isTemp() && op.isFirstKillBeforeDef())
2903 tmp_file.fill(op);
2904 }
2905
2906 ASSERTED bool success = false;
2907 success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
2908 assert(success);
2909
2910 update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
2911 }
2912
2913 if (!definition.isTemp())
2914 continue;
2915
2916 ctx.assignments[definition.tempId()].set(definition);
2917 register_file.fill(definition);
2918 }
2919
2920 /* handle all other definitions */
2921 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2922 Definition* definition = &instr->definitions[i];
2923
2924 if (definition->isFixed() || !definition->isTemp())
2925 continue;
2926
2927 /* find free reg */
2928 if (instr->opcode == aco_opcode::p_split_vector) {
2929 PhysReg reg = instr->operands[0].physReg();
2930 RegClass rc = definition->regClass();
2931 for (unsigned j = 0; j < i; j++)
2932 reg.reg_b += instr->definitions[j].bytes();
2933 if (get_reg_specified(ctx, register_file, rc, instr, reg)) {
2934 definition->setFixed(reg);
2935 } else if (i == 0) {
2936 RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
2937 DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
2938 std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info);
2939 if (res && get_reg_specified(ctx, register_file, rc, instr, *res))
2940 definition->setFixed(*res);
2941 } else if (instr->definitions[i - 1].isFixed()) {
2942 reg = instr->definitions[i - 1].physReg();
2943 reg.reg_b += instr->definitions[i - 1].bytes();
2944 if (get_reg_specified(ctx, register_file, rc, instr, reg))
2945 definition->setFixed(reg);
2946 }
2947 } else if (instr->opcode == aco_opcode::p_parallelcopy ||
2948 (instr->opcode == aco_opcode::p_start_linear_vgpr &&
2949 !instr->operands.empty())) {
2950 PhysReg reg = instr->operands[i].physReg();
2951 if (instr->operands[i].isTemp() &&
2952 instr->operands[i].getTemp().type() == definition->getTemp().type() &&
2953 !register_file.test(reg, definition->bytes()))
2954 definition->setFixed(reg);
2955 } else if (instr->opcode == aco_opcode::p_extract_vector) {
2956 PhysReg reg = instr->operands[0].physReg();
2957 reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
2958 if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
2959 definition->setFixed(reg);
2960 } else if (instr->opcode == aco_opcode::p_create_vector) {
2961 PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
2962 parallelcopy, instr);
2963 update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
2964 definition->setFixed(reg);
2965 } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma &&
2966 instr->operands[2].isTemp() && instr->operands[2].isKill() &&
2967 instr->operands[2].regClass() == definition->regClass()) {
2968 /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it.
2969 * Here we set a policy of forcing them the same if operands[2] gets killed (and
2970 * otherwise they don't overlap). This may not be optimal if RA would select a
2971 * different location due to affinity, but that gets complicated very quickly. */
2972 definition->setFixed(instr->operands[2].physReg());
2973 }
2974
2975 if (!definition->isFixed()) {
2976 Temp tmp = definition->getTemp();
2977 if (definition->regClass().is_subdword() && definition->bytes() < 4) {
2978 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
2979 definition->setFixed(reg);
2980 if (reg.byte() || register_file.test(reg, 4)) {
2981 add_subdword_definition(program, instr, reg);
2982 definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
2983 the reference */
2984 }
2985 } else {
2986 definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
2987 }
2988 update_renames(ctx, register_file, parallelcopy, instr,
2989 instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
2990 : (UpdateRenames)0);
2991 }
2992
2993 assert(
2994 definition->isFixed() &&
2995 ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
2996 (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
2997 ctx.assignments[definition->tempId()].set(*definition);
2998 register_file.fill(*definition);
2999 }
3000
3001 handle_pseudo(ctx, register_file, instr.get());
3002
3003 /* kill definitions and late-kill operands and ensure that sub-dword operands can actually
3004 * be read */
3005 for (const Definition& def : instr->definitions) {
3006 if (def.isTemp() && def.isKill())
3007 register_file.clear(def);
3008 }
3009 for (unsigned i = 0; i < instr->operands.size(); i++) {
3010 const Operand& op = instr->operands[i];
3011 if (op.isTemp() && op.isFirstKill() && op.isLateKill())
3012 register_file.clear(op);
3013 if (op.isTemp() && op.physReg().byte() != 0)
3014 add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
3015 }
3016
3017 /* emit parallelcopy */
3018 if (!parallelcopy.empty()) {
3019 aco_ptr<Pseudo_instruction> pc;
3020 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
3021 Format::PSEUDO, parallelcopy.size(),
3022 parallelcopy.size()));
3023 bool linear_vgpr = false;
3024 bool sgpr_operands_alias_defs = false;
3025 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
3026 for (unsigned i = 0; i < parallelcopy.size(); i++) {
3027 linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
3028
3029 if (temp_in_scc && parallelcopy[i].first.isTemp() &&
3030 parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
3031 if (!sgpr_operands_alias_defs) {
3032 unsigned reg = parallelcopy[i].first.physReg().reg();
3033 unsigned size = parallelcopy[i].first.getTemp().size();
3034 sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
3035
3036 reg = parallelcopy[i].second.physReg().reg();
3037 size = parallelcopy[i].second.getTemp().size();
3038 if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
3039 sgpr_operands_alias_defs = true;
3040 }
3041 }
3042
3043 pc->operands[i] = parallelcopy[i].first;
3044 pc->definitions[i] = parallelcopy[i].second;
3045 assert(pc->operands[i].size() == pc->definitions[i].size());
3046
3047 /* it might happen that the operand is already renamed. we have to restore the
3048 * original name. */
3049 std::unordered_map<unsigned, Temp>::iterator it =
3050 ctx.orig_names.find(pc->operands[i].tempId());
3051 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
3052 ctx.orig_names[pc->definitions[i].tempId()] = orig;
3053 ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
3054 }
3055
3056 if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
3057 /* disable definitions and re-enable operands */
3058 RegisterFile tmp_file(register_file);
3059 for (const Definition& def : instr->definitions) {
3060 if (def.isTemp() && !def.isKill())
3061 tmp_file.clear(def);
3062 }
3063 for (const Operand& op : instr->operands) {
3064 if (op.isTemp() && op.isFirstKill())
3065 tmp_file.block(op.physReg(), op.regClass());
3066 }
3067
3068 handle_pseudo(ctx, tmp_file, pc.get());
3069 } else {
3070 pc->tmp_in_scc = false;
3071 }
3072
3073 instructions.emplace_back(std::move(pc));
3074 }
3075
3076 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
3077 bool instr_needs_vop3 =
3078 !instr->isVOP3() &&
3079 ((withoutDPP(instr->format) == Format::VOPC &&
3080 instr->definitions[0].physReg() != vcc) ||
3081 (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) ||
3082 ((instr->opcode == aco_opcode::v_add_co_u32 ||
3083 instr->opcode == aco_opcode::v_addc_co_u32 ||
3084 instr->opcode == aco_opcode::v_sub_co_u32 ||
3085 instr->opcode == aco_opcode::v_subb_co_u32 ||
3086 instr->opcode == aco_opcode::v_subrev_co_u32 ||
3087 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
3088 instr->definitions[1].physReg() != vcc) ||
3089 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
3090 instr->opcode == aco_opcode::v_subb_co_u32 ||
3091 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
3092 instr->operands[2].physReg() != vcc));
3093 if (instr_needs_vop3) {
3094
3095 /* if the first operand is a literal, we have to move it to a reg */
3096 if (instr->operands.size() && instr->operands[0].isLiteral() &&
3097 program->gfx_level < GFX10) {
3098 bool can_sgpr = true;
3099 /* check, if we have to move to vgpr */
3100 for (const Operand& op : instr->operands) {
3101 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
3102 can_sgpr = false;
3103 break;
3104 }
3105 }
3106 /* disable definitions and re-enable operands */
3107 RegisterFile tmp_file(register_file);
3108 for (const Definition& def : instr->definitions)
3109 tmp_file.clear(def);
3110 for (const Operand& op : instr->operands) {
3111 if (op.isTemp() && op.isFirstKill())
3112 tmp_file.block(op.physReg(), op.regClass());
3113 }
3114 Temp tmp = program->allocateTmp(can_sgpr ? s1 : v1);
3115 ctx.assignments.emplace_back();
3116 PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr);
3117 update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
3118
3119 aco_ptr<Instruction> mov;
3120 if (can_sgpr)
3121 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32,
3122 Format::SOP1, 1, 1));
3123 else
3124 mov.reset(create_instruction<VALU_instruction>(aco_opcode::v_mov_b32,
3125 Format::VOP1, 1, 1));
3126 mov->operands[0] = instr->operands[0];
3127 mov->definitions[0] = Definition(tmp);
3128 mov->definitions[0].setFixed(reg);
3129
3130 instr->operands[0] = Operand(tmp);
3131 instr->operands[0].setFixed(reg);
3132 instr->operands[0].setFirstKill(true);
3133
3134 instructions.emplace_back(std::move(mov));
3135 }
3136
3137 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
3138 instr->format = asVOP3(instr->format);
3139 }
3140
3141 instructions.emplace_back(std::move(*instr_it));
3142
3143 } /* end for Instr */
3144
3145 block.instructions = std::move(instructions);
3146 } /* end for BB */
3147
3148 /* num_gpr = rnd_up(max_used_gpr + 1) */
3149 program->config->num_vgprs =
3150 std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
3151 program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
3152
3153 program->progress = CompilationProgress::after_ra;
3154 }
3155
3156 } // namespace aco
3157